From dd38ad4f92456cab144f7ec93c371fb25bacdb13 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 23 Oct 2025 14:26:39 +0200 Subject: [PATCH 01/53] add attrs accesor --- src/spatialdata/_io/accessor.py | 77 +++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 src/spatialdata/_io/accessor.py diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py new file mode 100644 index 000000000..845ae03ff --- /dev/null +++ b/src/spatialdata/_io/accessor.py @@ -0,0 +1,77 @@ +from typing import Any + +import dask.dataframe as dd +from dask.dataframe.extensions import register_dataframe_accessor, register_series_accessor + + +@register_dataframe_accessor("attrs") +class DfAttrsAccessor: + def __init__(self, dask_obj): + self._obj = dask_obj + if not hasattr(dask_obj, "_attrs"): + dask_obj._attrs = {} + + def __getitem__(self, key): + return self._obj._attrs[key] + + def __setitem__(self, key, value): + self._obj._attrs[key] = value + + def __iter__(self): + return iter(self._obj._attrs) + + def __repr__(self): + return repr(self._obj._attrs) + + +@register_series_accessor("attrs") +class SeriesAttrsAccessor: + def __init__(self, dask_obj): + self._obj = dask_obj + if not hasattr(dask_obj, "_attrs"): + dask_obj._attrs = {} + + def __getitem__(self, key): + return self._obj._attrs[key] + + def __setitem__(self, key, value): + self._obj._attrs[key] = value + + def __iter__(self): + return iter(self._obj._attrs) + + def __repr__(self): + return repr(self._obj._attrs) + + +def wrap_with_attrs(method: Any): + """Wrap a Dask DataFrame method to preserve _attrs. + + Copies _attrs from self before calling method, then assigns to result. + Safe for lazy operations like set_index, assign, map_partitions. 
+ """ + + def wrapper(self, *args, **kwargs): + old_attrs = getattr(self, "_attrs", {}).copy() + result = method(self, *args, **kwargs) + result.attrs = old_attrs + return result + + return wrapper + + +methods_to_wrap = [ + "set_index", + "compute", + "drop", + # "assign", + # "map_partitions", + # "merge", + # "join", + # "repartition", +] + +for method_name in methods_to_wrap: + if hasattr(dd.DataFrame, method_name): + original_method = getattr(dd.DataFrame, method_name) + setattr(dd.DataFrame, method_name, wrap_with_attrs(original_method)) From 4feb491468ffbfbbe743f2237b87aee35409e475 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 23 Oct 2025 14:28:01 +0200 Subject: [PATCH 02/53] change deprecated Index access --- src/spatialdata/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/spatialdata/datasets.py b/src/spatialdata/datasets.py index 63c137cdc..ea38d739b 100644 --- a/src/spatialdata/datasets.py +++ b/src/spatialdata/datasets.py @@ -365,7 +365,7 @@ def blobs_annotating_element(name: BlobsTypes) -> SpatialData: instance_id = get_element_instances(sdata[name]).tolist() else: index = sdata[name].index - instance_id = index.compute().tolist() if isinstance(index, dask.dataframe.core.Index) else index.tolist() + instance_id = index.compute().tolist() if isinstance(index, dask.dataframe.Index) else index.tolist() n = len(instance_id) new_table = AnnData(shape=(n, 0), obs={"region": pd.Categorical([name] * n), "instance_id": instance_id}) new_table = TableModel.parse(new_table, region=name, region_key="region", instance_key="instance_id") From 1c042ea8f078ea69142fbb8e3704785bb122fbdc Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 23 Oct 2025 14:30:07 +0200 Subject: [PATCH 03/53] add accessor to init --- src/spatialdata/__init__.py | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index 5d84e172b..a07957378 
100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -1,19 +1,6 @@ -import dask - -dask.config.set({"dataframe.query-planning": False}) -import dask.dataframe as dd - -# Setting `dataframe.query-planning` to False is effective only if run before `dask.dataframe` is initialized. In -# the case in which the user had initilized `dask.dataframe` before, we would have DASK_EXPER_ENABLED set to `True`. -# Here we check that this does not happen. -if hasattr(dd, "DASK_EXPR_ENABLED") and dd.DASK_EXPR_ENABLED: - raise RuntimeError( - "Unsupported backend: dask-expr has been detected as the backend of dask.dataframe. Please " - "use:\nimport dask\ndask.config.set({'dataframe.query-planning': False})\nbefore importing " - "dask.dataframe to disable dask-expr. The support is being worked on, for more information please see" - "https://github.com/scverse/spatialdata/pull/570" - ) from importlib.metadata import version +# TODO chance * import +from spatialdata._io.accessor import * __version__ = version("spatialdata") From b733de293386c4c12b6e7b5c9a3935e493b6b0ce Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 23 Oct 2025 14:30:47 +0200 Subject: [PATCH 04/53] remove query planning --- tests/conftest.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 775721253..c931b3f88 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,3 @@ -import dask - -dask.config.set({"dataframe.query-planning": False}) from collections.abc import Sequence from pathlib import Path from typing import Any From e53e2158441b9018c5efb0c754ee8bd45e5e7c6a Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sat, 25 Oct 2025 21:23:53 +0200 Subject: [PATCH 05/53] additional changes to accessor --- src/spatialdata/_core/_deepcopy.py | 6 +- src/spatialdata/_core/operations/transform.py | 10 +- src/spatialdata/_core/query/spatial_query.py | 4 +- src/spatialdata/_io/accessor.py | 124 +++++++++++++++--- 
src/spatialdata/transformations/_utils.py | 2 +- src/spatialdata/transformations/operations.py | 2 + 6 files changed, 123 insertions(+), 25 deletions(-) diff --git a/src/spatialdata/_core/_deepcopy.py b/src/spatialdata/_core/_deepcopy.py index 6a5b43367..8c5356b24 100644 --- a/src/spatialdata/_core/_deepcopy.py +++ b/src/spatialdata/_core/_deepcopy.py @@ -94,9 +94,11 @@ def _(gdf: GeoDataFrame) -> GeoDataFrame: @deepcopy.register(DaskDataFrame) def _(df: DaskDataFrame) -> DaskDataFrame: # bug: the parser may change the order of the columns - new_ddf = PointsModel.parse(df.compute().copy(deep=True)) + compute_df = df.compute().copy(deep=True) + new_ddf = PointsModel.parse(compute_df) # the problem is not .copy(deep=True), but the parser, which discards some metadata https://github.com/scverse/spatialdata/issues/503#issuecomment-2015275322 - new_ddf.attrs = _deepcopy(df.attrs) + # We need to use the compute_df here as with deepcopy, df._attrs does not exist anymore. + new_ddf.attrs = _deepcopy(compute_df.attrs) return new_ddf diff --git a/src/spatialdata/_core/operations/transform.py b/src/spatialdata/_core/operations/transform.py index da56dc391..f2ed03f89 100644 --- a/src/spatialdata/_core/operations/transform.py +++ b/src/spatialdata/_core/operations/transform.py @@ -8,6 +8,7 @@ import dask.array as da import dask_image.ndinterp import numpy as np +import pandas as pd from dask.array.core import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -432,7 +433,8 @@ def _( xtransformed = transformation._transform_coordinates(xdata) transformed = data.drop(columns=list(axes)).copy() # dummy transformation that will be replaced by _adjust_transformation() - transformed.attrs[TRANSFORM_KEY] = {DEFAULT_COORDINATE_SYSTEM: Identity()} + default_cs = {DEFAULT_COORDINATE_SYSTEM: Identity()} + transformed.attrs[TRANSFORM_KEY] = default_cs # TODO: the following line, used in place of the line before, leads to an 
incorrect aggregation result. Look into # this! Reported here: ... # transformed.attrs = {TRANSFORM_KEY: {DEFAULT_COORDINATE_SYSTEM: Identity()}} @@ -440,7 +442,11 @@ def _( for ax in axes: indices = xtransformed["dim"] == ax new_ax = xtransformed[:, indices] - transformed[ax] = new_ax.data.flatten() + # TODO: discuss with dask team + # This is not nice, but otherwise there is a problem with the joint graph of new_ax and transformed, causing + # a getattr missing dependency of dependent from_dask_array. + new_col = pd.Series(new_ax.data.flatten().compute(), index=transformed.index) + transformed[ax] = new_col old_transformations = get_transformation(data, get_all=True) assert isinstance(old_transformations, dict) diff --git a/src/spatialdata/_core/query/spatial_query.py b/src/spatialdata/_core/query/spatial_query.py index e6dccb458..f30b41c0e 100644 --- a/src/spatialdata/_core/query/spatial_query.py +++ b/src/spatialdata/_core/query/spatial_query.py @@ -672,8 +672,8 @@ def _( max_coordinate=max_coordinate_intrinsic, ) - # assert that the number of bounding boxes is correct - assert len(in_intrinsic_bounding_box) == len(min_coordinate) + if not (len_df := len(in_intrinsic_bounding_box)) == (len_bb := len(min_coordinate)): + raise ValueError(f"Number of dataframes `{len_df}` is not equal to the number of bounding boxes `{len_bb}`.") points_in_intrinsic_bounding_box: list[DaskDataFrame | None] = [] for mask in in_intrinsic_bounding_box: if mask.sum() == 0: diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 845ae03ff..ffd75cf29 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -1,11 +1,13 @@ +from collections.abc import MutableMapping from typing import Any import dask.dataframe as dd from dask.dataframe.extensions import register_dataframe_accessor, register_series_accessor -@register_dataframe_accessor("attrs") -class DfAttrsAccessor: +class _AttrsBase(MutableMapping): + """Base accessor that 
stores arbitrary metadata on Dask objects.""" + def __init__(self, dask_obj): self._obj = dask_obj if not hasattr(dask_obj, "_attrs"): @@ -17,31 +19,42 @@ def __getitem__(self, key): def __setitem__(self, key, value): self._obj._attrs[key] = value + def __delitem__(self, key): + del self._obj._attrs[key] + def __iter__(self): return iter(self._obj._attrs) + def __len__(self): + return len(self._obj._attrs) + def __repr__(self): return repr(self._obj._attrs) + def __str__(self): + return str(self._obj._attrs) -@register_series_accessor("attrs") -class SeriesAttrsAccessor: - def __init__(self, dask_obj): - self._obj = dask_obj - if not hasattr(dask_obj, "_attrs"): - dask_obj._attrs = {} + def copy(self): + return self._obj._attrs.copy() - def __getitem__(self, key): - return self._obj._attrs[key] + @property + def data(self): + """Access the raw internal attrs dict.""" + return self._obj._attrs - def __setitem__(self, key, value): - self._obj._attrs[key] = value - def __iter__(self): - return iter(self._obj._attrs) +@register_dataframe_accessor("attrs") +class DfAttrsAccessor(_AttrsBase): + """Dict-like .attrs accessor for Dask DataFrames.""" - def __repr__(self): - return repr(self._obj._attrs) + pass + + +@register_series_accessor("attrs") +class SeriesAttrsAccessor(_AttrsBase): + """Dict-like .attrs accessor for Dask Series.""" + + pass def wrap_with_attrs(method: Any): @@ -52,7 +65,13 @@ def wrap_with_attrs(method: Any): """ def wrapper(self, *args, **kwargs): - old_attrs = getattr(self, "_attrs", {}).copy() + old_accessor = getattr(self, "attrs", {}) + if hasattr(old_accessor, "_obj") and hasattr(old_accessor._obj, "_attrs"): + old_attrs = old_accessor._obj._attrs.copy() + elif isinstance(old_accessor, dict): + old_attrs = old_accessor.copy() + else: + old_attrs = {} result = method(self, *args, **kwargs) result.attrs = old_attrs return result @@ -60,12 +79,78 @@ def wrapper(self, *args, **kwargs): return wrapper +def wrap_loc_with_attrs(): + """Patch 
dd.DataFrame.loc to preserve _attrs.""" + original_property = dd.DataFrame.loc # this is a property + + @property + def loc_with_attrs(self): + df = self + loc = original_property.fget(df) + + class LocWrapper: + def __init__(self, parent_loc, parent_df): + self._parent_loc = parent_loc + self._parent_df = parent_df + + def __getitem__(self, key): + result = self._parent_loc[key] + if hasattr(self._parent_df, "_attrs"): + result.attrs = self._parent_df._attrs.copy() + return result + + def __setitem__(self, key, value): + # preserve attrs even if user assigns via .loc + self._parent_loc[key] = value + return self._parent_df + + def __repr__(self): + return repr(self._parent_loc) + + return LocWrapper(loc, df) + + dd.DataFrame.loc = loc_with_attrs + + +def wrap_iloc_with_attrs(): + """Patch dd.DataFrame.iloc to preserve _attrs.""" + original_property = dd.DataFrame.iloc # this is a property + + @property + def iloc_with_attrs(self): + df = self + iloc = original_property.fget(df) + + class ILocWrapper: + def __init__(self, parent_iloc, parent_df): + self._parent_iloc = parent_iloc + self._parent_df = parent_df + + def __getitem__(self, key): + result = self._parent_iloc[key] + if hasattr(self._parent_df, "_attrs"): + result.attrs = self._parent_df._attrs.copy() + return result + + def __setitem__(self, key, value): + self._parent_iloc[key] = value + return self._parent_df + + def __repr__(self): + return repr(self._parent_iloc) + + return ILocWrapper(iloc, df) + + dd.DataFrame.iloc = iloc_with_attrs + + methods_to_wrap = [ "set_index", "compute", "drop", + "__getitem__", # "assign", - # "map_partitions", + "map_partitions", # "merge", # "join", # "repartition", @@ -75,3 +160,6 @@ def wrapper(self, *args, **kwargs): if hasattr(dd.DataFrame, method_name): original_method = getattr(dd.DataFrame, method_name) setattr(dd.DataFrame, method_name, wrap_with_attrs(original_method)) + +wrap_loc_with_attrs() +wrap_iloc_with_attrs() diff --git 
a/src/spatialdata/transformations/_utils.py b/src/spatialdata/transformations/_utils.py index 44f9998b9..6089fca82 100644 --- a/src/spatialdata/transformations/_utils.py +++ b/src/spatialdata/transformations/_utils.py @@ -55,7 +55,7 @@ def _set_transformations_to_element(element: Any, transformations: MappingToCoor attrs[TRANSFORM_KEY] = {} attrs[TRANSFORM_KEY] = transformations # this calls an eventual setter in the element class; modifying the attrs directly would not trigger the setter - element.attrs = attrs + # element.attrs = attrs @singledispatch diff --git a/src/spatialdata/transformations/operations.py b/src/spatialdata/transformations/operations.py index 15fe3d57c..8cfc9fc4f 100644 --- a/src/spatialdata/transformations/operations.py +++ b/src/spatialdata/transformations/operations.py @@ -115,6 +115,8 @@ def get_transformation( from spatialdata.models._utils import DEFAULT_COORDINATE_SYSTEM transformations = _get_transformations(element) + # if not isinstance(transformations, dict): + # print() assert isinstance(transformations, dict) if get_all is False: From 3ae1d29357c9211acb7e554a387526caf6fc8b80 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sat, 25 Oct 2025 22:25:55 +0200 Subject: [PATCH 06/53] divisions is not settable anymore --- src/spatialdata/models/models.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/spatialdata/models/models.py b/src/spatialdata/models/models.py index 60f4ee205..bed33ff1d 100644 --- a/src/spatialdata/models/models.py +++ b/src/spatialdata/models/models.py @@ -808,9 +808,7 @@ def _( sort=sort, **kwargs, ) - # we cannot compute the divisions whne the index is not monotonically increasing and npartitions > 1 - if not table.known_divisions and (sort or table.npartitions == 1): - table.divisions = table.compute_current_divisions() + # TODO: dask does not allow for setting divisions directly anymore. We have to decide on forcing the user. 
if feature_key is not None: feature_categ = dd.from_pandas( data[feature_key].astype(str).astype("category"), From 19684fb04ac523234ff5000a14bfa0fcee53e427 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 27 Oct 2025 17:12:31 +0100 Subject: [PATCH 07/53] add fixes --- src/spatialdata/_core/operations/rasterize.py | 4 ++++ src/spatialdata/_io/_utils.py | 14 ++++++++++++-- src/spatialdata/_io/accessor.py | 3 ++- tests/io/test_readwrite.py | 3 +-- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/spatialdata/_core/operations/rasterize.py b/src/spatialdata/_core/operations/rasterize.py index a075aeb38..4e060f2db 100644 --- a/src/spatialdata/_core/operations/rasterize.py +++ b/src/spatialdata/_core/operations/rasterize.py @@ -666,6 +666,10 @@ def rasterize_shapes_points( label_index_to_category = None if VALUES_COLUMN in data and data[VALUES_COLUMN].dtype == "category": if isinstance(data, DaskDataFrame): + # We have to do this because as_known() does not preserve the order anymore in latest dask versions + # TODO discuss whether we can always expect the index from before to be monotonically increasing, because + # then we don't have to check order + # data[VALUES_COLUMN] = data[VALUES_COLUMN].cat.set_categories(data.index, ordered=True) data[VALUES_COLUMN] = data[VALUES_COLUMN].cat.as_known() label_index_to_category = dict(enumerate(data[VALUES_COLUMN].cat.categories, start=1)) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 20c236275..68a1a02de 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -337,15 +337,25 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No ) parquet_file = t[0] files.append(str(UPath(parquet_file).resolve())) - elif isinstance(v, tuple) and len(v) > 1 and isinstance(v[1], dict) and "piece" in v[1]: + elif "piece" in v.args[0]: # 
https://github.com/dask/dask/blob/ff2488aec44d641696e0b7aa41ed9e995c710705/dask/dataframe/io/parquet/core.py#L870 - parquet_file, check0, check1 = v[1]["piece"] + parquet_file, check0, check1 = v.args[0]["piece"] if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: raise ValueError( f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " f"report this bug." ) files.append(os.path.realpath(parquet_file)) + else: + for task in v.args[0].value: + if isinstance(task.args[0], dict) and "piece" in task.args[0]: + parquet_file, check0, check1 = task.args[0]["piece"] + if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: + raise ValueError( + f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " + f"report this bug." + ) + files.append(os.path.realpath(parquet_file)) def _backed_elements_contained_in_path(path: Path, object: SpatialData | SpatialElement | AnnData) -> list[bool]: diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index ffd75cf29..750eaa0da 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -149,7 +149,8 @@ def __repr__(self): "compute", "drop", "__getitem__", - # "assign", + "copy", + "cat", "map_partitions", # "merge", # "join", diff --git a/tests/io/test_readwrite.py b/tests/io/test_readwrite.py index 8501687ca..6e948f519 100644 --- a/tests/io/test_readwrite.py +++ b/tests/io/test_readwrite.py @@ -229,8 +229,7 @@ def _workaround1_dask_backed( del sdata[new_name] sdata.delete_element_from_disk(new_name) - # @pytest.mark.parametrize("dask_backed", [True, False]) - @pytest.mark.parametrize("dask_backed", [True]) + @pytest.mark.parametrize("dask_backed", [True, False]) @pytest.mark.parametrize("workaround", [1, 2]) def test_incremental_io_on_disk( self, From 51b733e931cc128c7b91b92e1be4f1222ba87ca8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 16:14:44 +0000 Subject: [PATCH 08/53] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/spatialdata/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index a07957378..dbe405526 100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -1,4 +1,5 @@ from importlib.metadata import version + # TODO chance * import from spatialdata._io.accessor import * From 88fe00325fa14111f0c3fb7c662c8d799cd90b9f Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 28 Oct 2025 14:15:38 +0100 Subject: [PATCH 09/53] fix rasterize points --- src/spatialdata/_core/operations/rasterize.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_core/operations/rasterize.py b/src/spatialdata/_core/operations/rasterize.py index 4e060f2db..6da0a7cc8 100644 --- a/src/spatialdata/_core/operations/rasterize.py +++ b/src/spatialdata/_core/operations/rasterize.py @@ -653,12 +653,14 @@ def rasterize_shapes_points( table_name = table_name if table_name is not None else "table" + index = False if value_key is not None: kwargs = {"sdata": sdata, "element_name": element_name} if element_name is not None else {"element": data} data[VALUES_COLUMN] = get_values(value_key, table_name=table_name, **kwargs).iloc[:, 0] # type: ignore[arg-type, union-attr] elif isinstance(data, GeoDataFrame) or isinstance(data, DaskDataFrame) and return_regions_as_labels is True: value_key = VALUES_COLUMN data[VALUES_COLUMN] = data.index.astype("category") + index = True else: value_key = VALUES_COLUMN data[VALUES_COLUMN] = 1 @@ -668,9 +670,11 @@ def rasterize_shapes_points( if isinstance(data, DaskDataFrame): # We have to do this because as_known() does not preserve the order anymore in latest dask versions # TODO discuss whether we can always expect the 
index from before to be monotonically increasing, because - # then we don't have to check order - # data[VALUES_COLUMN] = data[VALUES_COLUMN].cat.set_categories(data.index, ordered=True) - data[VALUES_COLUMN] = data[VALUES_COLUMN].cat.as_known() + # then we don't have to check order. + if index: + data[VALUES_COLUMN] = data[VALUES_COLUMN].cat.set_categories(data.index, ordered=True) + else: + data[VALUES_COLUMN] = data[VALUES_COLUMN].cat.as_known() label_index_to_category = dict(enumerate(data[VALUES_COLUMN].cat.categories, start=1)) if return_single_channel is None: From 239f6937caf682eeeb5feef900a5bc2a4676ce6e Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 28 Oct 2025 20:14:38 +0100 Subject: [PATCH 10/53] copy partitioned attrs --- src/spatialdata/_io/accessor.py | 4 ++++ tests/core/query/test_spatial_query.py | 8 +++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 750eaa0da..554db3c55 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -97,6 +97,8 @@ def __getitem__(self, key): result = self._parent_loc[key] if hasattr(self._parent_df, "_attrs"): result.attrs = self._parent_df._attrs.copy() + if hasattr(self._parent_df, "attrs"): + result.attrs = self._parent_df.attrs.copy() return result def __setitem__(self, key, value): @@ -130,6 +132,8 @@ def __getitem__(self, key): result = self._parent_iloc[key] if hasattr(self._parent_df, "_attrs"): result.attrs = self._parent_df._attrs.copy() + if hasattr(self._parent_df, "attrs"): + result.attrs = self._parent_df.attrs.copy() return result def __setitem__(self, key, value): diff --git a/tests/core/query/test_spatial_query.py b/tests/core/query/test_spatial_query.py index fc59d0698..96cc9f42c 100644 --- a/tests/core/query/test_spatial_query.py +++ b/tests/core/query/test_spatial_query.py @@ -645,10 +645,13 @@ def _query( geopandas.testing.assert_geodataframe_equal(queried[name], 
queried_back[name]) -@pytest.mark.parametrize("with_polygon_query", [True, False]) +@pytest.mark.parametrize("with_polygon_query", [False]) def test_query_points_multiple_partitions(points, with_polygon_query: bool): p0 = points["points_0"] - p1 = PointsModel.parse(dd.from_pandas(p0.compute(), npartitions=10)) + attrs = p0.attrs.copy() + ddf = dd.from_pandas(p0.compute(), npartitions=10) + ddf.attrs = attrs + p1 = PointsModel.parse(ddf) def _query(p: DaskDataFrame) -> DaskDataFrame: if with_polygon_query: @@ -669,7 +672,6 @@ def _query(p: DaskDataFrame) -> DaskDataFrame: q0 = _query(p0) q1 = _query(p1) assert np.array_equal(q0.index.compute(), q1.index.compute()) - pass @pytest.mark.parametrize("with_polygon_query", [True, False]) From afad6bdb75df8a57a55c317942bd317b3fb84c6e Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 30 Oct 2025 00:24:56 +0100 Subject: [PATCH 11/53] fix mypy --- src/spatialdata/_io/accessor.py | 58 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 554db3c55..c190ec61c 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -1,44 +1,44 @@ -from collections.abc import MutableMapping +from collections.abc import Callable, Iterator, MutableMapping from typing import Any import dask.dataframe as dd from dask.dataframe.extensions import register_dataframe_accessor, register_series_accessor -class _AttrsBase(MutableMapping): +class _AttrsBase(MutableMapping[str, str | dict[str, str]]): """Base accessor that stores arbitrary metadata on Dask objects.""" - def __init__(self, dask_obj): + def __init__(self, dask_obj: dd.DataFrame | dd.Series): self._obj = dask_obj if not hasattr(dask_obj, "_attrs"): dask_obj._attrs = {} - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: return self._obj._attrs[key] - def __setitem__(self, key, value): + def __setitem__(self, key: str, 
value: str | dict[str, str]) -> None: self._obj._attrs[key] = value - def __delitem__(self, key): + def __delitem__(self, key: str) -> None: del self._obj._attrs[key] - def __iter__(self): + def __iter__(self) -> Iterator[str]: return iter(self._obj._attrs) - def __len__(self): + def __len__(self) -> int: return len(self._obj._attrs) - def __repr__(self): + def __repr__(self) -> str: return repr(self._obj._attrs) - def __str__(self): + def __str__(self) -> str: return str(self._obj._attrs) - def copy(self): + def copy(self) -> Any: return self._obj._attrs.copy() @property - def data(self): + def data(self) -> Any: """Access the raw internal attrs dict.""" return self._obj._attrs @@ -57,14 +57,14 @@ class SeriesAttrsAccessor(_AttrsBase): pass -def wrap_with_attrs(method: Any): +def wrap_with_attrs(method: Callable[..., Any]) -> Callable[..., Any]: """Wrap a Dask DataFrame method to preserve _attrs. Copies _attrs from self before calling method, then assigns to result. Safe for lazy operations like set_index, assign, map_partitions. 
""" - def wrapper(self, *args, **kwargs): + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: old_accessor = getattr(self, "attrs", {}) if hasattr(old_accessor, "_obj") and hasattr(old_accessor._obj, "_attrs"): old_attrs = old_accessor._obj._attrs.copy() @@ -79,21 +79,20 @@ def wrapper(self, *args, **kwargs): return wrapper -def wrap_loc_with_attrs(): +def wrap_loc_with_attrs() -> None: """Patch dd.DataFrame.loc to preserve _attrs.""" original_property = dd.DataFrame.loc # this is a property - @property - def loc_with_attrs(self): + def loc_with_attrs(self: dd.DataFrame) -> Any: df = self loc = original_property.fget(df) class LocWrapper: - def __init__(self, parent_loc, parent_df): + def __init__(self, parent_loc: Any, parent_df: dd.DataFrame) -> None: self._parent_loc = parent_loc self._parent_df = parent_df - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: result = self._parent_loc[key] if hasattr(self._parent_df, "_attrs"): result.attrs = self._parent_df._attrs.copy() @@ -101,34 +100,33 @@ def __getitem__(self, key): result.attrs = self._parent_df.attrs.copy() return result - def __setitem__(self, key, value): + def __setitem__(self, key: str, value: Any) -> dd.DataFrame: # preserve attrs even if user assigns via .loc self._parent_loc[key] = value return self._parent_df - def __repr__(self): + def __repr__(self) -> str: return repr(self._parent_loc) return LocWrapper(loc, df) - dd.DataFrame.loc = loc_with_attrs + dd.DataFrame.loc = property(loc_with_attrs) -def wrap_iloc_with_attrs(): +def wrap_iloc_with_attrs() -> None: """Patch dd.DataFrame.iloc to preserve _attrs.""" original_property = dd.DataFrame.iloc # this is a property - @property - def iloc_with_attrs(self): + def iloc_with_attrs(self: dd.DataFrame) -> Any: df = self iloc = original_property.fget(df) class ILocWrapper: - def __init__(self, parent_iloc, parent_df): + def __init__(self, parent_iloc: Any, parent_df: dd.DataFrame) -> None: self._parent_iloc = parent_iloc 
self._parent_df = parent_df - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: result = self._parent_iloc[key] if hasattr(self._parent_df, "_attrs"): result.attrs = self._parent_df._attrs.copy() @@ -136,16 +134,16 @@ def __getitem__(self, key): result.attrs = self._parent_df.attrs.copy() return result - def __setitem__(self, key, value): + def __setitem__(self, key: str, value: Any) -> dd.DataFrame: self._parent_iloc[key] = value return self._parent_df - def __repr__(self): + def __repr__(self) -> str: return repr(self._parent_iloc) return ILocWrapper(iloc, df) - dd.DataFrame.iloc = iloc_with_attrs + dd.DataFrame.iloc = property(iloc_with_attrs) methods_to_wrap = [ From e017ca7d6107623750196606a07fe8e4407c242f Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 30 Oct 2025 00:41:51 +0100 Subject: [PATCH 12/53] fix last mypy error --- src/spatialdata/__init__.py | 12 ++++++++++-- src/spatialdata/_io/accessor.py | 3 --- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index ac6cf4ee6..dfa6e5d78 100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -1,7 +1,12 @@ from importlib.metadata import version -# TODO change * import -from spatialdata._io.accessor import * +from spatialdata._io.accessor import ( + DfAttrsAccessor, + SeriesAttrsAccessor, + wrap_iloc_with_attrs, + wrap_loc_with_attrs, + wrap_with_attrs, +) __version__ = version("spatialdata") @@ -71,3 +76,6 @@ from spatialdata._io.format import SpatialDataFormatType from spatialdata._io.io_zarr import read_zarr from spatialdata._utils import get_pyramid_levels, unpad_raster + +wrap_loc_with_attrs() +wrap_iloc_with_attrs() diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index c190ec61c..7dc0daef8 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -163,6 +163,3 @@ def __repr__(self) -> str: if hasattr(dd.DataFrame, 
method_name): original_method = getattr(dd.DataFrame, method_name) setattr(dd.DataFrame, method_name, wrap_with_attrs(original_method)) - -wrap_loc_with_attrs() -wrap_iloc_with_attrs() From d11655aeb979da8f1a278113f23ae6010cce685c Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 30 Oct 2025 10:16:08 +0100 Subject: [PATCH 13/53] Apply suggestion from @melonora --- tests/core/query/test_spatial_query.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/core/query/test_spatial_query.py b/tests/core/query/test_spatial_query.py index 96cc9f42c..214f67d2c 100644 --- a/tests/core/query/test_spatial_query.py +++ b/tests/core/query/test_spatial_query.py @@ -645,7 +645,7 @@ def _query( geopandas.testing.assert_geodataframe_equal(queried[name], queried_back[name]) -@pytest.mark.parametrize("with_polygon_query", [False]) +@pytest.mark.parametrize("with_polygon_query", [True, False]) def test_query_points_multiple_partitions(points, with_polygon_query: bool): p0 = points["points_0"] attrs = p0.attrs.copy() From 8253eb86b3e79738f48898b15710efec90f906b2 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 30 Oct 2025 10:16:20 +0100 Subject: [PATCH 14/53] Apply suggestion from @melonora --- src/spatialdata/transformations/operations.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/spatialdata/transformations/operations.py b/src/spatialdata/transformations/operations.py index 8cfc9fc4f..15fe3d57c 100644 --- a/src/spatialdata/transformations/operations.py +++ b/src/spatialdata/transformations/operations.py @@ -115,8 +115,6 @@ def get_transformation( from spatialdata.models._utils import DEFAULT_COORDINATE_SYSTEM transformations = _get_transformations(element) - # if not isinstance(transformations, dict): - # print() assert isinstance(transformations, dict) if get_all is False: From 078469a59cee3d7ec203eefd4c9c065c84ec82b5 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 30 Oct 2025 10:16:25 +0100 Subject: 
[PATCH 15/53] Apply suggestion from @melonora --- src/spatialdata/transformations/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/spatialdata/transformations/_utils.py b/src/spatialdata/transformations/_utils.py index 6089fca82..654d2f8b1 100644 --- a/src/spatialdata/transformations/_utils.py +++ b/src/spatialdata/transformations/_utils.py @@ -55,7 +55,6 @@ def _set_transformations_to_element(element: Any, transformations: MappingToCoor attrs[TRANSFORM_KEY] = {} attrs[TRANSFORM_KEY] = transformations # this calls an eventual setter in the element class; modifying the attrs directly would not trigger the setter - # element.attrs = attrs @singledispatch From 65839b4e4f72b45b153ed719bd7c238794ba0161 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Thu, 30 Oct 2025 16:52:09 +0100 Subject: [PATCH 16/53] deduplicate --- src/spatialdata/__init__.py | 7 ++-- src/spatialdata/_io/accessor.py | 58 +++++++-------------------------- 2 files changed, 15 insertions(+), 50 deletions(-) diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index dfa6e5d78..febe904f6 100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -3,8 +3,7 @@ from spatialdata._io.accessor import ( DfAttrsAccessor, SeriesAttrsAccessor, - wrap_iloc_with_attrs, - wrap_loc_with_attrs, + wrap_indexer_with_attrs, wrap_with_attrs, ) @@ -77,5 +76,5 @@ from spatialdata._io.io_zarr import read_zarr from spatialdata._utils import get_pyramid_levels, unpad_raster -wrap_loc_with_attrs() -wrap_iloc_with_attrs() +wrap_indexer_with_attrs("loc") +wrap_indexer_with_attrs("iloc") diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 7dc0daef8..2be072ef9 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -1,5 +1,5 @@ from collections.abc import Callable, Iterator, MutableMapping -from typing import Any +from typing import Any, Literal import dask.dataframe as dd from dask.dataframe.extensions 
import register_dataframe_accessor, register_series_accessor @@ -79,23 +79,25 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return wrapper -def wrap_loc_with_attrs() -> None: - """Patch dd.DataFrame.loc to preserve _attrs.""" - original_property = dd.DataFrame.loc # this is a property +def wrap_indexer_with_attrs(indexer_name: Literal["loc", "iloc"]) -> None: + """Patch dd.DataFrame.loc or iloc to preserve _attrs. - def loc_with_attrs(self: dd.DataFrame) -> Any: + Reason for having this separate from methods is because both loc and iloc are a property that return an indexer. + Therefore, they have to be wrapped differently from methods in order to preserve attrs. + """ + original_property = getattr(dd.DataFrame, indexer_name) # this is a property + + def indexer_with_attrs(self: dd.DataFrame) -> Any: df = self loc = original_property.fget(df) - class LocWrapper: + class IndexerWrapper: def __init__(self, parent_loc: Any, parent_df: dd.DataFrame) -> None: self._parent_loc = parent_loc self._parent_df = parent_df def __getitem__(self, key: str) -> Any: result = self._parent_loc[key] - if hasattr(self._parent_df, "_attrs"): - result.attrs = self._parent_df._attrs.copy() if hasattr(self._parent_df, "attrs"): result.attrs = self._parent_df.attrs.copy() return result @@ -108,42 +110,9 @@ def __setitem__(self, key: str, value: Any) -> dd.DataFrame: def __repr__(self) -> str: return repr(self._parent_loc) - return LocWrapper(loc, df) - - dd.DataFrame.loc = property(loc_with_attrs) - - -def wrap_iloc_with_attrs() -> None: - """Patch dd.DataFrame.iloc to preserve _attrs.""" - original_property = dd.DataFrame.iloc # this is a property - - def iloc_with_attrs(self: dd.DataFrame) -> Any: - df = self - iloc = original_property.fget(df) - - class ILocWrapper: - def __init__(self, parent_iloc: Any, parent_df: dd.DataFrame) -> None: - self._parent_iloc = parent_iloc - self._parent_df = parent_df - - def __getitem__(self, key: str) -> Any: - result = 
self._parent_iloc[key] - if hasattr(self._parent_df, "_attrs"): - result.attrs = self._parent_df._attrs.copy() - if hasattr(self._parent_df, "attrs"): - result.attrs = self._parent_df.attrs.copy() - return result - - def __setitem__(self, key: str, value: Any) -> dd.DataFrame: - self._parent_iloc[key] = value - return self._parent_df - - def __repr__(self) -> str: - return repr(self._parent_iloc) - - return ILocWrapper(iloc, df) + return IndexerWrapper(loc, df) - dd.DataFrame.iloc = property(iloc_with_attrs) + setattr(dd.DataFrame, indexer_name, property(indexer_with_attrs)) methods_to_wrap = [ @@ -154,9 +123,6 @@ def __repr__(self) -> str: "copy", "cat", "map_partitions", - # "merge", - # "join", - # "repartition", ] for method_name in methods_to_wrap: From f1fb487276408b0f450df86e7336c44eace8ea2e Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 16:44:41 +0100 Subject: [PATCH 17/53] .attrs is now always an accessor, never a dict --- src/spatialdata/_core/_deepcopy.py | 3 ++- src/spatialdata/_io/accessor.py | 34 +++++++++++++++++++------- tests/core/query/test_spatial_query.py | 3 ++- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/src/spatialdata/_core/_deepcopy.py b/src/spatialdata/_core/_deepcopy.py index 8c5356b24..8b8c0b5ce 100644 --- a/src/spatialdata/_core/_deepcopy.py +++ b/src/spatialdata/_core/_deepcopy.py @@ -98,7 +98,8 @@ def _(df: DaskDataFrame) -> DaskDataFrame: new_ddf = PointsModel.parse(compute_df) # the problem is not .copy(deep=True), but the parser, which discards some metadata https://github.com/scverse/spatialdata/issues/503#issuecomment-2015275322 # We need to use the compute_df here as with deepcopy, df._attrs does not exist anymore. 
- new_ddf.attrs = _deepcopy(compute_df.attrs) + # print(type(new_ddf.attrs)) + new_ddf.attrs.update(_deepcopy(compute_df.attrs)) return new_ddf diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 2be072ef9..49d3ebf76 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -2,7 +2,10 @@ from typing import Any, Literal import dask.dataframe as dd -from dask.dataframe.extensions import register_dataframe_accessor, register_series_accessor +from dask.dataframe.extensions import ( + register_dataframe_accessor, + register_series_accessor, +) class _AttrsBase(MutableMapping[str, str | dict[str, str]]): @@ -65,15 +68,28 @@ def wrap_with_attrs(method: Callable[..., Any]) -> Callable[..., Any]: """ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - old_accessor = getattr(self, "attrs", {}) - if hasattr(old_accessor, "_obj") and hasattr(old_accessor._obj, "_attrs"): - old_attrs = old_accessor._obj._attrs.copy() - elif isinstance(old_accessor, dict): - old_attrs = old_accessor.copy() - else: + old_accessor = getattr(self, "attrs") + if isinstance(old_accessor, dict): + raise RuntimeError( + "Invalid type (.attrs is a `dict` and not a , likely due to an invalid assignment: my_dd_object.attrs was overwritten with a dict. " + "Do not assign to 'attrs'. Use my_dd_object.attrs.update(...) instead." 
+ ) + if not hasattr(old_accessor._obj, "_attrs"): old_attrs = {} + else: + old_attrs = old_accessor._obj._attrs.copy() + result = method(self, *args, **kwargs) - result.attrs = old_attrs + # Check if result is a Dask object (has the attrs accessor) vs pandas (plain dict attrs) + if isinstance(result, (dd.DataFrame, dd.Series)): + # Dask DataFrame/Series: initialize _attrs if needed, then assign + result.attrs.update(old_attrs) + # if not hasattr(result, "_attrs"): + # result._attrs = {} + # result._attrs = old_attrs + else: + # Pandas DataFrame/Series: assign to attrs (which is a plain dict attribute) + result.attrs = old_attrs return result return wrapper @@ -99,7 +115,7 @@ def __init__(self, parent_loc: Any, parent_df: dd.DataFrame) -> None: def __getitem__(self, key: str) -> Any: result = self._parent_loc[key] if hasattr(self._parent_df, "attrs"): - result.attrs = self._parent_df.attrs.copy() + result._attrs = self._parent_df.attrs.copy() return result def __setitem__(self, key: str, value: Any) -> dd.DataFrame: diff --git a/tests/core/query/test_spatial_query.py b/tests/core/query/test_spatial_query.py index 214f67d2c..77843f74e 100644 --- a/tests/core/query/test_spatial_query.py +++ b/tests/core/query/test_spatial_query.py @@ -650,7 +650,8 @@ def test_query_points_multiple_partitions(points, with_polygon_query: bool): p0 = points["points_0"] attrs = p0.attrs.copy() ddf = dd.from_pandas(p0.compute(), npartitions=10) - ddf.attrs = attrs + ddf.attrs.update(attrs) + # ddf._attrs = attrs p1 = PointsModel.parse(ddf) def _query(p: DaskDataFrame) -> DaskDataFrame: From d7d0b4d75b61c50c1e12d6b5bb1becf3d95a5b96 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 16:58:13 +0100 Subject: [PATCH 18/53] simplify wrapper logic: --- src/spatialdata/_io/accessor.py | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 49d3ebf76..ec3c66107 100644 --- 
a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -68,28 +68,17 @@ def wrap_with_attrs(method: Callable[..., Any]) -> Callable[..., Any]: """ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - old_accessor = getattr(self, "attrs") - if isinstance(old_accessor, dict): + if not isinstance(self.attrs, DfAttrsAccessor | SeriesAttrsAccessor): raise RuntimeError( - "Invalid type (.attrs is a `dict` and not a , likely due to an invalid assignment: my_dd_object.attrs was overwritten with a dict. " - "Do not assign to 'attrs'. Use my_dd_object.attrs.update(...) instead." + "Invalid .attrs: expected an accessor (DfAttrsAccessor or SeriesAttrsAccessor), " + f"got {type(self.attrs).__name__}. A common cause is assigning a dict, e.g. " + "my_dd_object.attrs = {...}. Do not assign to 'attrs'; use " + "my_dd_object.attrs.update(...) instead." ) - if not hasattr(old_accessor._obj, "_attrs"): - old_attrs = {} - else: - old_attrs = old_accessor._obj._attrs.copy() + old_attrs = self.attrs.copy() result = method(self, *args, **kwargs) - # Check if result is a Dask object (has the attrs accessor) vs pandas (plain dict attrs) - if isinstance(result, (dd.DataFrame, dd.Series)): - # Dask DataFrame/Series: initialize _attrs if needed, then assign - result.attrs.update(old_attrs) - # if not hasattr(result, "_attrs"): - # result._attrs = {} - # result._attrs = old_attrs - else: - # Pandas DataFrame/Series: assign to attrs (which is a plain dict attribute) - result.attrs = old_attrs + result.attrs.update(old_attrs) return result return wrapper From 66a6095f2d1807ba0b1b6d7ad6b08a3d372e963d Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 17:08:30 +0100 Subject: [PATCH 19/53] revert after loc/iloc indexer --- src/spatialdata/_core/operations/transform.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/spatialdata/_core/operations/transform.py b/src/spatialdata/_core/operations/transform.py index 
f2ed03f89..cca2a5238 100644 --- a/src/spatialdata/_core/operations/transform.py +++ b/src/spatialdata/_core/operations/transform.py @@ -8,7 +8,6 @@ import dask.array as da import dask_image.ndinterp import numpy as np -import pandas as pd from dask.array.core import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -442,11 +441,7 @@ def _( for ax in axes: indices = xtransformed["dim"] == ax new_ax = xtransformed[:, indices] - # TODO: discuss with dask team - # This is not nice, but otherwise there is a problem with the joint graph of new_ax and transformed, causing - # a getattr missing dependency of dependent from_dask_array. - new_col = pd.Series(new_ax.data.flatten().compute(), index=transformed.index) - transformed[ax] = new_col + transformed[ax] = new_ax.data.flatten() old_transformations = get_transformation(data, get_all=True) assert isinstance(old_transformations, dict) From 2c81509da15f815eb7ceafb2a7b55c19e19a366f Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 17:17:47 +0100 Subject: [PATCH 20/53] clean-up, simplify accessor logic --- src/spatialdata/__init__.py | 10 +--------- src/spatialdata/_io/accessor.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index febe904f6..42fae49f4 100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -1,11 +1,6 @@ from importlib.metadata import version -from spatialdata._io.accessor import ( - DfAttrsAccessor, - SeriesAttrsAccessor, - wrap_indexer_with_attrs, - wrap_with_attrs, -) +import spatialdata._io.accessor # noqa: F401 __version__ = version("spatialdata") @@ -75,6 +70,3 @@ from spatialdata._io.format import SpatialDataFormatType from spatialdata._io.io_zarr import read_zarr from spatialdata._utils import get_pyramid_levels, unpad_raster - -wrap_indexer_with_attrs("loc") -wrap_indexer_with_attrs("iloc") diff 
--git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index ec3c66107..f9e4b2369 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -1,5 +1,5 @@ -from collections.abc import Callable, Iterator, MutableMapping -from typing import Any, Literal +from collections.abc import Iterator, MutableMapping +from typing import Any, Literal, cast import dask.dataframe as dd from dask.dataframe.extensions import ( @@ -60,12 +60,13 @@ class SeriesAttrsAccessor(_AttrsBase): pass -def wrap_with_attrs(method: Callable[..., Any]) -> Callable[..., Any]: +def wrap_with_attrs(method_name: str) -> None: """Wrap a Dask DataFrame method to preserve _attrs. Copies _attrs from self before calling method, then assigns to result. Safe for lazy operations like set_index, assign, map_partitions. """ + original_method = getattr(dd.DataFrame, method_name) def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: if not isinstance(self.attrs, DfAttrsAccessor | SeriesAttrsAccessor): @@ -77,11 +78,11 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: ) old_attrs = self.attrs.copy() - result = method(self, *args, **kwargs) + result = original_method(self, *args, **kwargs) result.attrs.update(old_attrs) return result - return wrapper + setattr(dd.DataFrame, method_name, wrapper) def wrap_indexer_with_attrs(indexer_name: Literal["loc", "iloc"]) -> None: @@ -120,17 +121,16 @@ def __repr__(self) -> str: setattr(dd.DataFrame, indexer_name, property(indexer_with_attrs)) -methods_to_wrap = [ +for method_name in [ "set_index", "compute", "drop", "__getitem__", "copy", - "cat", "map_partitions", -] +]: + wrap_with_attrs(method_name) -for method_name in methods_to_wrap: - if hasattr(dd.DataFrame, method_name): - original_method = getattr(dd.DataFrame, method_name) - setattr(dd.DataFrame, method_name, wrap_with_attrs(original_method)) + +for indexer_name in ["loc", "iloc"]: + wrap_indexer_with_attrs(cast(Literal["loc", "iloc"], indexer_name)) From 
47670daf6baa9de756c095ed569923bbd735770e Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 17:18:38 +0100 Subject: [PATCH 21/53] remove asserts --- src/spatialdata/_core/operations/transform.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/spatialdata/_core/operations/transform.py b/src/spatialdata/_core/operations/transform.py index cca2a5238..a77711b1f 100644 --- a/src/spatialdata/_core/operations/transform.py +++ b/src/spatialdata/_core/operations/transform.py @@ -434,10 +434,7 @@ def _( # dummy transformation that will be replaced by _adjust_transformation() default_cs = {DEFAULT_COORDINATE_SYSTEM: Identity()} transformed.attrs[TRANSFORM_KEY] = default_cs - # TODO: the following line, used in place of the line before, leads to an incorrect aggregation result. Look into - # this! Reported here: ... - # transformed.attrs = {TRANSFORM_KEY: {DEFAULT_COORDINATE_SYSTEM: Identity()}} - assert isinstance(transformed, DaskDataFrame) + for ax in axes: indices = xtransformed["dim"] == ax new_ax = xtransformed[:, indices] From e3c8bc869c3157f73d7800a8c189efee5dae16b2 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 17:23:28 +0100 Subject: [PATCH 22/53] remove asserts --- src/spatialdata/_core/operations/transform.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/spatialdata/_core/operations/transform.py b/src/spatialdata/_core/operations/transform.py index a77711b1f..27c3452c2 100644 --- a/src/spatialdata/_core/operations/transform.py +++ b/src/spatialdata/_core/operations/transform.py @@ -3,7 +3,7 @@ import itertools import warnings from functools import singledispatch -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, cast import dask.array as da import dask_image.ndinterp @@ -440,8 +440,8 @@ def _( new_ax = xtransformed[:, indices] transformed[ax] = new_ax.data.flatten() - old_transformations = get_transformation(data, get_all=True) - 
assert isinstance(old_transformations, dict) + old_transformations = cast(dict[str, Any], get_transformation(data, get_all=True)) + _set_transformation_for_transformed_elements( transformed, old_transformations, From d43753cd1e7152e44e70f04fa416f55600a5e4c7 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 17:26:21 +0100 Subject: [PATCH 23/53] simplify accessor logic by reducing number of classes --- src/spatialdata/_io/accessor.py | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index f9e4b2369..a04ecff22 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -8,8 +8,10 @@ ) -class _AttrsBase(MutableMapping[str, str | dict[str, str]]): - """Base accessor that stores arbitrary metadata on Dask objects.""" +@register_dataframe_accessor("attrs") +@register_series_accessor("attrs") +class AttrsAccessor(MutableMapping[str, str | dict[str, Any]]): + """Accessor that stores a dict of arbitrary metadata on Dask objects.""" def __init__(self, dask_obj: dd.DataFrame | dd.Series): self._obj = dask_obj @@ -19,7 +21,7 @@ def __init__(self, dask_obj: dd.DataFrame | dd.Series): def __getitem__(self, key: str) -> Any: return self._obj._attrs[key] - def __setitem__(self, key: str, value: str | dict[str, str]) -> None: + def __setitem__(self, key: str, value: str | dict[str, Any]) -> None: self._obj._attrs[key] = value def __delitem__(self, key: str) -> None: @@ -46,20 +48,6 @@ def data(self) -> Any: return self._obj._attrs -@register_dataframe_accessor("attrs") -class DfAttrsAccessor(_AttrsBase): - """Dict-like .attrs accessor for Dask DataFrames.""" - - pass - - -@register_series_accessor("attrs") -class SeriesAttrsAccessor(_AttrsBase): - """Dict-like .attrs accessor for Dask Series.""" - - pass - - def wrap_with_attrs(method_name: str) -> None: """Wrap a Dask DataFrame method to preserve _attrs. 
@@ -69,9 +57,9 @@ def wrap_with_attrs(method_name: str) -> None: original_method = getattr(dd.DataFrame, method_name) def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: - if not isinstance(self.attrs, DfAttrsAccessor | SeriesAttrsAccessor): + if not isinstance(self.attrs, AttrsAccessor): raise RuntimeError( - "Invalid .attrs: expected an accessor (DfAttrsAccessor or SeriesAttrsAccessor), " + "Invalid .attrs: expected an accessor (`AttrsAccessor`), " f"got {type(self.attrs).__name__}. A common cause is assigning a dict, e.g. " "my_dd_object.attrs = {...}. Do not assign to 'attrs'; use " "my_dd_object.attrs.update(...) instead." From e0ab1d86cef140aa409c3d9334c67ac49d34cb93 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 17:30:02 +0100 Subject: [PATCH 24/53] rename wrap_with_attrs --- src/spatialdata/_io/accessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index a04ecff22..a702df0d2 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -48,7 +48,7 @@ def data(self) -> Any: return self._obj._attrs -def wrap_with_attrs(method_name: str) -> None: +def wrap_method_with_attrs(method_name: str) -> None: """Wrap a Dask DataFrame method to preserve _attrs. Copies _attrs from self before calling method, then assigns to result. 
@@ -117,7 +117,7 @@ def __repr__(self) -> str: "copy", "map_partitions", ]: - wrap_with_attrs(method_name) + wrap_method_with_attrs(method_name) for indexer_name in ["loc", "iloc"]: From 4257183344003b9f2a7840d6265f1f2385ccdf1d Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 17:36:48 +0100 Subject: [PATCH 25/53] remove comment --- src/spatialdata/transformations/_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/spatialdata/transformations/_utils.py b/src/spatialdata/transformations/_utils.py index 654d2f8b1..6d3b2c1a4 100644 --- a/src/spatialdata/transformations/_utils.py +++ b/src/spatialdata/transformations/_utils.py @@ -54,7 +54,6 @@ def _set_transformations_to_element(element: Any, transformations: MappingToCoor if TRANSFORM_KEY not in attrs: attrs[TRANSFORM_KEY] = {} attrs[TRANSFORM_KEY] = transformations - # this calls an eventual setter in the element class; modifying the attrs directly would not trigger the setter @singledispatch From e49a5804cbf001df9a47bbd3054b5a81804cad64 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 18:30:52 +0100 Subject: [PATCH 26/53] wrapping methods for dd.Series --- src/spatialdata/_io/accessor.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index a702df0d2..0bcdb69a3 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -48,13 +48,13 @@ def data(self) -> Any: return self._obj._attrs -def wrap_method_with_attrs(method_name: str) -> None: +def wrap_method_with_attrs(method_name: str, dask_class: type[dd.DataFrame] | type[dd.Series]) -> None: """Wrap a Dask DataFrame method to preserve _attrs. Copies _attrs from self before calling method, then assigns to result. Safe for lazy operations like set_index, assign, map_partitions. 
""" - original_method = getattr(dd.DataFrame, method_name) + original_method = getattr(dask_class, method_name) def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: if not isinstance(self.attrs, AttrsAccessor): @@ -70,7 +70,7 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: result.attrs.update(old_attrs) return result - setattr(dd.DataFrame, method_name, wrapper) + setattr(dask_class, method_name, wrapper) def wrap_indexer_with_attrs(indexer_name: Literal["loc", "iloc"]) -> None: @@ -117,8 +117,15 @@ def __repr__(self) -> str: "copy", "map_partitions", ]: - wrap_method_with_attrs(method_name) + wrap_method_with_attrs(method_name=method_name, dask_class=dd.DataFrame) +for method_name in [ + "compute", + "__getitem__", + "copy", + "map_partitions", +]: + wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) for indexer_name in ["loc", "iloc"]: wrap_indexer_with_attrs(cast(Literal["loc", "iloc"], indexer_name)) From ed6b45725952fe8f01b6ac5eab771f46af4ed950 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 18:58:38 +0100 Subject: [PATCH 27/53] add dask tests for accessor --- src/spatialdata/_io/accessor.py | 20 ++-- tests/models/test_accessor.py | 193 ++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 10 deletions(-) create mode 100644 tests/models/test_accessor.py diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 0bcdb69a3..e6f771d35 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -110,22 +110,22 @@ def __repr__(self) -> str: for method_name in [ - "set_index", - "compute", - "drop", "__getitem__", + "compute", "copy", + "drop", "map_partitions", + "set_index", ]: wrap_method_with_attrs(method_name=method_name, dask_class=dd.DataFrame) -for method_name in [ - "compute", - "__getitem__", - "copy", - "map_partitions", -]: - wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) +# for method_name in [ +# 
"__getitem__", +# "compute", +# "copy", +# "map_partitions", +# ]: +# wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) for indexer_name in ["loc", "iloc"]: wrap_indexer_with_attrs(cast(Literal["loc", "iloc"], indexer_name)) diff --git a/tests/models/test_accessor.py b/tests/models/test_accessor.py new file mode 100644 index 000000000..20dbd3bcf --- /dev/null +++ b/tests/models/test_accessor.py @@ -0,0 +1,193 @@ +import dask.dataframe as dd +import pandas as pd +import pytest + +from spatialdata._io.accessor import AttrsAccessor + +# ============================================================================ +# General tests +# ============================================================================ + + +def test_dataframe_attrs_is_accessor(): + """Test that DataFrame.attrs is an AttrsAccessor, not a dict.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2) + assert isinstance(df.attrs, AttrsAccessor) + + +def test_series_attrs_is_accessor(): + """Test that Series.attrs is an AttrsAccessor, not a dict.""" + s = dd.from_pandas(pd.Series([1, 2, 3], name="test"), npartitions=2) + assert isinstance(s.attrs, AttrsAccessor) + + +def test_attrs_setitem_getitem(): + """Test setting and getting attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2) + df.attrs["key"] = "value" + assert df.attrs["key"] == "value" + + +def test_attrs_update(): + """Test that attrs.update() works.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2) + df.attrs.update({"key1": "value1", "key2": "value2"}) + assert df.attrs["key1"] == "value1" + assert df.attrs["key2"] == "value2" + + +def test_invalid_attrs_assignment_raises(): + """Test that assigning a dict to attrs raises an error on next operation.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + + # This is the wrong way to do it + df.attrs = {"key": "value"} + + # Should raise RuntimeError on next wrapped 
operation + with pytest.raises(RuntimeError, match="Invalid .attrs.*expected an accessor"): + df.set_index("a") + + +def test_chained_operations(): + """Test that attrs survive chained operations.""" + df = dd.from_pandas( + pd.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8], "c": [9, 10, 11, 12]}), + npartitions=2, + ) + df.attrs["experiment"] = "test123" + + result = df.set_index("a").drop("c", axis=1)[["b"]].copy() + + assert result.attrs["experiment"] == "test123" + assert isinstance(result.attrs, AttrsAccessor) + + +# ============================================================================ +# DataFrame wrapped methods tests +# ============================================================================ + + +def test_dataframe_getitem_preserves_attrs(): + """Test that DataFrame.__getitem__ preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + + # Single column (returns Series) + result = df["a"] + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + # Multiple columns (returns DataFrame) + result = df[["a", "b"]] + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +def test_dataframe_compute_preserves_attrs(): + """Test that DataFrame.compute preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + result = df.compute() + # compute returns a pandas DataFrame, which has attrs as a dict + assert result.attrs["key"] == "value" + + +def test_dataframe_copy_preserves_attrs(): + """Test that DataFrame.copy preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + result = df.copy() + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +def test_dataframe_drop_preserves_attrs(): + """Test that DataFrame.drop 
preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + result = df.drop("b", axis=1) + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +def test_dataframe_map_partitions_preserves_attrs(): + """Test that DataFrame.map_partitions preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + result = df.map_partitions(lambda x: x * 2) + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +def test_dataframe_set_index_preserves_attrs(): + """Test that DataFrame.set_index preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + result = df.set_index("a") + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +# ============================================================================ +# Series wrapped methods tests +# ============================================================================ + + +def test_series_getitem_preserves_attrs(): + """Test that Series.__getitem__ preserves attrs.""" + s = dd.from_pandas(pd.Series([1, 2, 3, 4, 5], name="test"), npartitions=2) + s.attrs["key"] = "value" + result = s[1:3] + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +def test_series_compute_preserves_attrs(): + """Test that Series.compute preserves attrs.""" + s = dd.from_pandas(pd.Series([1, 2, 3], name="test"), npartitions=2) + s.attrs["key"] = "value" + result = s.compute() + # compute returns a pandas Series, which has attrs as a dict + assert result.attrs["key"] == "value" + + +def test_series_copy_preserves_attrs(): + """Test that Series.copy preserves attrs.""" + s = dd.from_pandas(pd.Series([1, 2, 3], name="test"), npartitions=2) + s.attrs["key"] = "value" + result = s.copy() 
+ assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +def test_series_map_partitions_preserves_attrs(): + """Test that Series.map_partitions preserves attrs.""" + s = dd.from_pandas(pd.Series([1, 2, 3], name="test"), npartitions=2) + s.attrs["key"] = "value" + result = s.map_partitions(lambda x: x * 2) + assert result.attrs["key"] == "value" + assert isinstance(result.attrs, AttrsAccessor) + + +# ============================================================================ +# Indexer tests +# ============================================================================ + + +def test_dataframe_loc_preserves_attrs(): + """Test that DataFrame.loc preserves attrs.""" + df = dd.from_pandas( + pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=[10, 20, 30]), + npartitions=2, + ) + df.attrs["key"] = "value" + result = df.loc[10:20] + assert result.attrs["key"] == "value" + + +def test_dataframe_iloc_preserves_attrs(): + """Test that DataFrame.iloc preserves attrs.""" + df = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}), npartitions=2) + df.attrs["key"] = "value" + result = df.iloc[:, 0:1] + assert result.attrs["key"] == "value" From 9763016cc8b88706df7b8992c8b43dadc2dd3b43 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 19:20:02 +0100 Subject: [PATCH 28/53] fix index.compute() attrs missing --- src/spatialdata/_io/accessor.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index e6f771d35..9f43c33ce 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -2,6 +2,7 @@ from typing import Any, Literal, cast import dask.dataframe as dd +import pandas as pd from dask.dataframe.extensions import ( register_dataframe_accessor, register_series_accessor, @@ -67,7 +68,10 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: old_attrs = self.attrs.copy() result = 
original_method(self, *args, **kwargs) - result.attrs.update(old_attrs) + # the pandas Index do not have attrs, but dd.Index, since they are a subclass of dd.Series, do have attrs + # thanks to our accessor. Here we ensure that we do not assign attrs to pd.Index objects. + if not isinstance(result, pd.Index): + result.attrs.update(old_attrs) return result setattr(dask_class, method_name, wrapper) @@ -119,13 +123,13 @@ def __repr__(self) -> str: ]: wrap_method_with_attrs(method_name=method_name, dask_class=dd.DataFrame) -# for method_name in [ -# "__getitem__", -# "compute", -# "copy", -# "map_partitions", -# ]: -# wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) +for method_name in [ + "__getitem__", + "compute", + "copy", + "map_partitions", +]: + wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) for indexer_name in ["loc", "iloc"]: wrap_indexer_with_attrs(cast(Literal["loc", "iloc"], indexer_name)) From 0415c89355968bb359ccd3b07c7b8e349059a008 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 19:28:45 +0100 Subject: [PATCH 29/53] change fix .attrs on index --- src/spatialdata/_io/accessor.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index e6f771d35..dfd4a773d 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -67,7 +67,11 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: old_attrs = self.attrs.copy() result = original_method(self, *args, **kwargs) - result.attrs.update(old_attrs) + # the pandas Index do not have attrs, but dd.Index, since they are a subclass of dd.Series, do have attrs + # thanks to our accessor. Here we ensure that we do not assign attrs to pd.Index objects. 
+ if hasattr(result, "attrs"): + result.attrs.update(old_attrs) + return result setattr(dask_class, method_name, wrapper) @@ -119,13 +123,13 @@ def __repr__(self) -> str: ]: wrap_method_with_attrs(method_name=method_name, dask_class=dd.DataFrame) -# for method_name in [ -# "__getitem__", -# "compute", -# "copy", -# "map_partitions", -# ]: -# wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) +for method_name in [ + "__getitem__", + "compute", + "copy", + "map_partitions", +]: + wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) for indexer_name in ["loc", "iloc"]: wrap_indexer_with_attrs(cast(Literal["loc", "iloc"], indexer_name)) From dc40fde11724c7629e879a30756fede0608d8317 Mon Sep 17 00:00:00 2001 From: Luca Marconato Date: Sun, 2 Nov 2025 19:29:55 +0100 Subject: [PATCH 30/53] wrap dd.Series.loc --- src/spatialdata/_io/accessor.py | 46 ++++++++++++++++++--------------- tests/models/test_accessor.py | 14 ++++++++++ 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/src/spatialdata/_io/accessor.py b/src/spatialdata/_io/accessor.py index 9f43c33ce..1a1e96739 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/_io/accessor.py @@ -1,5 +1,5 @@ from collections.abc import Iterator, MutableMapping -from typing import Any, Literal, cast +from typing import Any, Literal import dask.dataframe as dd import pandas as pd @@ -77,40 +77,42 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: setattr(dask_class, method_name, wrapper) -def wrap_indexer_with_attrs(indexer_name: Literal["loc", "iloc"]) -> None: - """Patch dd.DataFrame.loc or iloc to preserve _attrs. +def wrap_indexer_with_attrs( + indexer_name: Literal["loc", "iloc"], dask_class: type[dd.DataFrame] | type[dd.Series] +) -> None: + """Patch dd.DataFrame or dd.Series loc or iloc to preserve _attrs. Reason for having this separate from methods is because both loc and iloc are a property that return an indexer. 
Therefore, they have to be wrapped differently from methods in order to preserve attrs. """ - original_property = getattr(dd.DataFrame, indexer_name) # this is a property + original_property = getattr(dask_class, indexer_name) # this is a property - def indexer_with_attrs(self: dd.DataFrame) -> Any: - df = self - loc = original_property.fget(df) + def indexer_with_attrs(self: dd.DataFrame | dd.Series) -> Any: + parent_obj = self + indexer = original_property.fget(parent_obj) class IndexerWrapper: - def __init__(self, parent_loc: Any, parent_df: dd.DataFrame) -> None: - self._parent_loc = parent_loc - self._parent_df = parent_df + def __init__(self, parent_indexer: Any, parent_obj: dd.DataFrame | dd.Series) -> None: + self._parent_indexer = parent_indexer + self._parent_obj = parent_obj def __getitem__(self, key: str) -> Any: - result = self._parent_loc[key] - if hasattr(self._parent_df, "attrs"): - result._attrs = self._parent_df.attrs.copy() + result = self._parent_indexer[key] + if hasattr(self._parent_obj, "attrs"): + result._attrs = self._parent_obj.attrs.copy() return result - def __setitem__(self, key: str, value: Any) -> dd.DataFrame: + def __setitem__(self, key: str, value: Any) -> dd.DataFrame | dd.Series: # preserve attrs even if user assigns via .loc - self._parent_loc[key] = value - return self._parent_df + self._parent_indexer[key] = value + return self._parent_obj def __repr__(self) -> str: - return repr(self._parent_loc) + return repr(self._parent_indexer) - return IndexerWrapper(loc, df) + return IndexerWrapper(indexer, parent_obj) - setattr(dd.DataFrame, indexer_name, property(indexer_with_attrs)) + setattr(dask_class, indexer_name, property(indexer_with_attrs)) for method_name in [ @@ -131,5 +133,7 @@ def __repr__(self) -> str: ]: wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) -for indexer_name in ["loc", "iloc"]: - wrap_indexer_with_attrs(cast(Literal["loc", "iloc"], indexer_name)) 
+wrap_indexer_with_attrs(indexer_name="loc", dask_class=dd.DataFrame) +wrap_indexer_with_attrs(indexer_name="iloc", dask_class=dd.DataFrame) +wrap_indexer_with_attrs(indexer_name="loc", dask_class=dd.Series) +# dd.Series do not have iloc diff --git a/tests/models/test_accessor.py b/tests/models/test_accessor.py index 20dbd3bcf..f550d94b9 100644 --- a/tests/models/test_accessor.py +++ b/tests/models/test_accessor.py @@ -191,3 +191,17 @@ def test_dataframe_iloc_preserves_attrs(): df.attrs["key"] = "value" result = df.iloc[:, 0:1] assert result.attrs["key"] == "value" + + +def test_series_loc_preserves_attrs(): + """Test that Series.loc preserves attrs.""" + s = dd.from_pandas( + pd.Series([1, 2, 3, 4, 5], index=[10, 20, 30, 40, 50], name="test"), + npartitions=2, + ) + s.attrs["key"] = "value" + result = s.loc[10:30] + assert result.attrs["key"] == "value" + + +# dd.Series do not have .iloc, hence there is no test_series_iloc_preserves_attrs() test From 3770407599054b7ea084e399817af55dc19c5495 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 20:49:04 +0100 Subject: [PATCH 31/53] remove old code, add comments --- src/spatialdata/_io/_utils.py | 16 +++++----------- tests/core/query/test_spatial_query.py | 1 - tests/io/test_utils.py | 2 ++ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 68a1a02de..281f0b040 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -327,17 +327,8 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No path = getattr(v.store, "path", None) if getattr(v.store, "path", None) else v.store.root files.append(str(UPath(path).resolve())) elif name.startswith("read-parquet") or name.startswith("read_parquet"): - if hasattr(v, "creation_info"): - # https://github.com/dask/dask/blob/ff2488aec44d641696e0b7aa41ed9e995c710705/dask/dataframe/io/parquet/core.py#L625 - t = 
v.creation_info["args"] - if not isinstance(t, tuple) or len(t) != 1: - raise ValueError( - f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " - f"report this bug." - ) - parquet_file = t[0] - files.append(str(UPath(parquet_file).resolve())) - elif "piece" in v.args[0]: + # Here v is a read_parquet task with arguments. + if "piece" in v.args[0]: # https://github.com/dask/dask/blob/ff2488aec44d641696e0b7aa41ed9e995c710705/dask/dataframe/io/parquet/core.py#L870 parquet_file, check0, check1 = v.args[0]["piece"] if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: @@ -347,6 +338,9 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No ) files.append(os.path.realpath(parquet_file)) else: + # This occurs when for example points and images are mixed, the main task still starts with + # read_parquet, but the execution happens through a subgraph which we iterate over to get the + # actual read_parquet task. 
for task in v.args[0].value: if isinstance(task.args[0], dict) and "piece" in task.args[0]: parquet_file, check0, check1 = task.args[0]["piece"] diff --git a/tests/core/query/test_spatial_query.py b/tests/core/query/test_spatial_query.py index 77843f74e..d7147dbfb 100644 --- a/tests/core/query/test_spatial_query.py +++ b/tests/core/query/test_spatial_query.py @@ -651,7 +651,6 @@ def test_query_points_multiple_partitions(points, with_polygon_query: bool): attrs = p0.attrs.copy() ddf = dd.from_pandas(p0.compute(), npartitions=10) ddf.attrs.update(attrs) - # ddf._attrs = attrs p1 = PointsModel.parse(ddf) def _query(p: DaskDataFrame) -> DaskDataFrame: diff --git a/tests/io/test_utils.py b/tests/io/test_utils.py index 0a430704f..d6e95604e 100644 --- a/tests/io/test_utils.py +++ b/tests/io/test_utils.py @@ -107,6 +107,8 @@ def test_backing_files_combining_points_and_images(points, images): images1 = read_zarr(f1) p0 = points0.points["points_0"] + ddf = dd.from_pandas(p0.compute(), npartitions=10) + get_dask_backing_files(ddf) im1 = images1.images["image2d"] v = p0["x"].loc[0].values v.compute_chunk_sizes() From f46e2255381c79ed37075093ba5d4fc1f28a706f Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 21:24:46 +0100 Subject: [PATCH 32/53] move accesor code --- src/spatialdata/__init__.py | 2 +- .../{_io/accessor.py => models/_accessor.py} | 27 ++++++++++--------- tests/models/test_accessor.py | 2 +- 3 files changed, 16 insertions(+), 15 deletions(-) rename src/spatialdata/{_io/accessor.py => models/_accessor.py} (80%) diff --git a/src/spatialdata/__init__.py b/src/spatialdata/__init__.py index 42fae49f4..2fb483505 100644 --- a/src/spatialdata/__init__.py +++ b/src/spatialdata/__init__.py @@ -1,6 +1,6 @@ from importlib.metadata import version -import spatialdata._io.accessor # noqa: F401 +import spatialdata.models._accessor # noqa: F401 __version__ = version("spatialdata") diff --git a/src/spatialdata/_io/accessor.py 
b/src/spatialdata/models/_accessor.py similarity index 80% rename from src/spatialdata/_io/accessor.py rename to src/spatialdata/models/_accessor.py index 585fdfa1f..a8b19653d 100644 --- a/src/spatialdata/_io/accessor.py +++ b/src/spatialdata/models/_accessor.py @@ -1,7 +1,8 @@ from collections.abc import Iterator, MutableMapping from typing import Any, Literal -import dask.dataframe as dd +from dask.dataframe import DataFrame as DaskDataFrame +from dask.dataframe import Series as DaskSeries from dask.dataframe.extensions import ( register_dataframe_accessor, register_series_accessor, @@ -13,7 +14,7 @@ class AttrsAccessor(MutableMapping[str, str | dict[str, Any]]): """Accessor that stores a dict of arbitrary metadata on Dask objects.""" - def __init__(self, dask_obj: dd.DataFrame | dd.Series): + def __init__(self, dask_obj: DaskDataFrame | DaskSeries): self._obj = dask_obj if not hasattr(dask_obj, "_attrs"): dask_obj._attrs = {} @@ -48,7 +49,7 @@ def data(self) -> Any: return self._obj._attrs -def wrap_method_with_attrs(method_name: str, dask_class: type[dd.DataFrame] | type[dd.Series]) -> None: +def wrap_method_with_attrs(method_name: str, dask_class: type[DaskDataFrame] | type[DaskSeries]) -> None: """Wrap a Dask DataFrame method to preserve _attrs. Copies _attrs from self before calling method, then assigns to result. @@ -78,7 +79,7 @@ def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: def wrap_indexer_with_attrs( - indexer_name: Literal["loc", "iloc"], dask_class: type[dd.DataFrame] | type[dd.Series] + indexer_name: Literal["loc", "iloc"], dask_class: type[DaskDataFrame] | type[DaskSeries] ) -> None: """Patch dd.DataFrame or dd.Series loc or iloc to preserve _attrs. 
@@ -87,12 +88,12 @@ def wrap_indexer_with_attrs( """ original_property = getattr(dask_class, indexer_name) # this is a property - def indexer_with_attrs(self: dd.DataFrame | dd.Series) -> Any: + def indexer_with_attrs(self: DaskDataFrame | DaskSeries) -> Any: parent_obj = self indexer = original_property.fget(parent_obj) class IndexerWrapper: - def __init__(self, parent_indexer: Any, parent_obj: dd.DataFrame | dd.Series) -> None: + def __init__(self, parent_indexer: Any, parent_obj: DaskDataFrame | DaskSeries) -> None: self._parent_indexer = parent_indexer self._parent_obj = parent_obj @@ -102,7 +103,7 @@ def __getitem__(self, key: str) -> Any: result._attrs = self._parent_obj.attrs.copy() return result - def __setitem__(self, key: str, value: Any) -> dd.DataFrame | dd.Series: + def __setitem__(self, key: str, value: Any) -> DaskDataFrame | DaskSeries: # preserve attrs even if user assigns via .loc self._parent_indexer[key] = value return self._parent_obj @@ -123,7 +124,7 @@ def __repr__(self) -> str: "map_partitions", "set_index", ]: - wrap_method_with_attrs(method_name=method_name, dask_class=dd.DataFrame) + wrap_method_with_attrs(method_name=method_name, dask_class=DaskDataFrame) for method_name in [ "__getitem__", @@ -131,9 +132,9 @@ def __repr__(self) -> str: "copy", "map_partitions", ]: - wrap_method_with_attrs(method_name=method_name, dask_class=dd.Series) + wrap_method_with_attrs(method_name=method_name, dask_class=DaskSeries) -wrap_indexer_with_attrs(indexer_name="loc", dask_class=dd.DataFrame) -wrap_indexer_with_attrs(indexer_name="iloc", dask_class=dd.DataFrame) -wrap_indexer_with_attrs(indexer_name="loc", dask_class=dd.Series) -# dd.Series do not have iloc +wrap_indexer_with_attrs(indexer_name="loc", dask_class=DaskDataFrame) +wrap_indexer_with_attrs(indexer_name="iloc", dask_class=DaskDataFrame) +wrap_indexer_with_attrs(indexer_name="loc", dask_class=DaskSeries) +# DaskSeries do not have iloc diff --git a/tests/models/test_accessor.py 
b/tests/models/test_accessor.py index f550d94b9..7356f52cc 100644 --- a/tests/models/test_accessor.py +++ b/tests/models/test_accessor.py @@ -2,7 +2,7 @@ import pandas as pd import pytest -from spatialdata._io.accessor import AttrsAccessor +from spatialdata.models._accessor import AttrsAccessor # ============================================================================ # General tests From f76672e9f95e769c6401f27378d0614977e00716 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 21:38:53 +0100 Subject: [PATCH 33/53] change git workflow --- .github/workflows/test.yaml | 16 +++++++++++++++- pyproject.toml | 2 +- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1003fddf1..00c284a5a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -23,6 +23,12 @@ jobs: include: - os: macos-latest python: "3.11" + dask-version: "2025.1.0" + name: "macOS Python 3.11 (Dask 2025.1.0)" + - os: macos-latest + python: "3.11" + dask-version: "latest" + name: "macOS Python 3.11 (Dask latest)" - os: macos-latest python: "3.12" pip-flags: "--pre" @@ -42,7 +48,15 @@ jobs: version: "latest" python-version: ${{ matrix.python }} - name: Install dependencies - run: "uv sync --extra test" + run: | + uv sync --extra test + if [[ "${{ matrix.os }}" == "macos-latest" && -n "${{ matrix.dask-version }}" ]]; then + if [[ "${{ matrix.dask-version }}" == "latest" ]]; then + uv pip install --upgrade dask + else + uv pip install dask==${{ matrix.dask-version }} + fi + fi - name: Test env: MPLBACKEND: agg diff --git a/pyproject.toml b/pyproject.toml index 766f75152..7816236ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "anndata>=0.9.1", "click", "dask-image", - "dask>=2024.10.0,<=2024.11.2", + "dask>=2025.1.0", "datashader", "fsspec[s3,http]", "geopandas>=0.14", From f68d55d8a57f3bc24a1718d302f940007c9119ed Mon Sep 17 00:00:00 2001 From: 
Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 23:25:01 +0100 Subject: [PATCH 34/53] some fixes --- src/spatialdata/_core/operations/transform.py | 7 ++++++- tests/io/test_pyramids_performance.py | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/spatialdata/_core/operations/transform.py b/src/spatialdata/_core/operations/transform.py index 27c3452c2..8340f23e7 100644 --- a/src/spatialdata/_core/operations/transform.py +++ b/src/spatialdata/_core/operations/transform.py @@ -8,6 +8,7 @@ import dask.array as da import dask_image.ndinterp import numpy as np +import pandas as pd from dask.array.core import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -438,7 +439,11 @@ def _( for ax in axes: indices = xtransformed["dim"] == ax new_ax = xtransformed[:, indices] - transformed[ax] = new_ax.data.flatten() + # TODO: discuss with dask team + # This is not nice, but otherwise there is a problem with the joint graph of new_ax and transformed, causing + # a getattr missing dependency of dependent from_dask_array. + new_col = pd.Series(new_ax.data.flatten().compute(), index=transformed.index) + transformed[ax] = new_col old_transformations = cast(dict[str, Any], get_transformation(data, get_all=True)) diff --git a/tests/io/test_pyramids_performance.py b/tests/io/test_pyramids_performance.py index 7f234800e..875879541 100644 --- a/tests/io/test_pyramids_performance.py +++ b/tests/io/test_pyramids_performance.py @@ -83,5 +83,9 @@ def test_write_image_multiscale_performance(sdata_with_image: SpatialData, tmp_p actual_num_chunk_writes = zarr_chunk_write_spy.call_count actual_num_chunk_reads = zarr_chunk_read_spy.call_count - assert actual_num_chunk_writes == num_chunks_all_scales.item() + # https://github.com/dask/dask/pull/11736 introduces an extra write of the last chunk when finalizing. 
+ assert actual_num_chunk_writes in { + num_chunks_all_scales.item(), + num_chunks_all_scales.item() + 1, + } assert actual_num_chunk_reads == num_chunks_scale0.item() From 9f26549256394b860acf8a1a5c20e2eea723414d Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sun, 2 Nov 2025 23:52:37 +0100 Subject: [PATCH 35/53] remove old test code --- tests/io/test_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/io/test_utils.py b/tests/io/test_utils.py index d6e95604e..57bfe6e42 100644 --- a/tests/io/test_utils.py +++ b/tests/io/test_utils.py @@ -107,8 +107,7 @@ def test_backing_files_combining_points_and_images(points, images): images1 = read_zarr(f1) p0 = points0.points["points_0"] - ddf = dd.from_pandas(p0.compute(), npartitions=10) - get_dask_backing_files(ddf) + im1 = images1.images["image2d"] v = p0["x"].loc[0].values v.compute_chunk_sizes() From 39dc8fbf7149132616f7ce9ebd0a7977ef5707b2 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:00:58 +0100 Subject: [PATCH 36/53] test dask among os --- .github/workflows/test.yaml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 00c284a5a..038d31097 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -29,12 +29,26 @@ jobs: python: "3.11" dask-version: "latest" name: "macOS Python 3.11 (Dask latest)" + - os: ubuntu-latest + python: "3.11" + dask-version: "2025.1.0" + name: "Ubuntu Python 3.11 (Dask 2025.1.0)" + - os: ubuntu-latest + python: "3.11" + dask-version: "latest" + name: "Ubuntu Python 3.11 (Dask latest)" + - os: windows-latest + python: "3.11" + dask-version: "2025.1.0" + name: "Windows Python 3.11 (Dask 2025.1.0)" + - os: windows-latest + python: "3.11" + dask-version: "latest" + name: "Windows Python 3.11 (Dask latest)" - os: macos-latest python: "3.12" pip-flags: "--pre" name: "Python 3.12 (pre-release)" - - os: 
windows-latest - python: "3.11" env: OS: ${{ matrix.os }} @@ -50,7 +64,7 @@ jobs: - name: Install dependencies run: | uv sync --extra test - if [[ "${{ matrix.os }}" == "macos-latest" && -n "${{ matrix.dask-version }}" ]]; then + if [[ -n "${{ matrix.dask-version }}" ]]; then if [[ "${{ matrix.dask-version }}" == "latest" ]]; then uv pip install --upgrade dask else From 1a8343957cded442ac95eaa6bc1f764c5288abee Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:18:59 +0100 Subject: [PATCH 37/53] fix --- .github/workflows/test.yaml | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 038d31097..0dcfc45aa 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,30 +24,31 @@ jobs: - os: macos-latest python: "3.11" dask-version: "2025.1.0" - name: "macOS Python 3.11 (Dask 2025.1.0)" + name: "Dask 2025.1.0" - os: macos-latest python: "3.11" dask-version: "latest" - name: "macOS Python 3.11 (Dask latest)" + name: "Dask latest" - os: ubuntu-latest python: "3.11" dask-version: "2025.1.0" - name: "Ubuntu Python 3.11 (Dask 2025.1.0)" + name: "Dask 2025.1.0" - os: ubuntu-latest python: "3.11" dask-version: "latest" - name: "Ubuntu Python 3.11 (Dask latest)" + name: "Dask latest" - os: windows-latest python: "3.11" dask-version: "2025.1.0" - name: "Windows Python 3.11 (Dask 2025.1.0)" + name: "Dask 2025.1.0" - os: windows-latest python: "3.11" dask-version: "latest" - name: "Windows Python 3.11 (Dask latest)" + name: "Dask latest" - os: macos-latest python: "3.12" pip-flags: "--pre" + dask-version: "latest" name: "Python 3.12 (pre-release)" env: @@ -64,12 +65,10 @@ jobs: - name: Install dependencies run: | uv sync --extra test - if [[ -n "${{ matrix.dask-version }}" ]]; then - if [[ "${{ matrix.dask-version }}" == "latest" ]]; then - uv pip install --upgrade dask - else - uv pip install dask==${{ matrix.dask-version }} - fi 
+ if [[ "${{ matrix.dask-version }}" == "latest" ]]; then + uv pip install --upgrade dask + else + uv pip install dask==${{ matrix.dask-version }} fi - name: Test env: From 18fdb703313b4ad43c4a18125ac12d8867210301 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:26:41 +0100 Subject: [PATCH 38/53] fix --- .github/workflows/test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0dcfc45aa..7f0169258 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -67,7 +67,7 @@ jobs: uv sync --extra test if [[ "${{ matrix.dask-version }}" == "latest" ]]; then uv pip install --upgrade dask - else + elif [[ -n "${{ matrix.dask-version }}" ]]; then uv pip install dask==${{ matrix.dask-version }} fi - name: Test From d43cac11f8a5a1d7075f7e977a3574b7566c1666 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:32:13 +0100 Subject: [PATCH 39/53] fix --- .github/workflows/test.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7f0169258..4aa597612 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -65,10 +65,11 @@ jobs: - name: Install dependencies run: | uv sync --extra test - if [[ "${{ matrix.dask-version }}" == "latest" ]]; then + DASK_VERSION="${{ matrix.dask-version }}" + if [[ "${DASK_VERSION}" == "latest" ]]; then uv pip install --upgrade dask - elif [[ -n "${{ matrix.dask-version }}" ]]; then - uv pip install dask==${{ matrix.dask-version }} + elif [[ -n "${DASK_VERSION}" ]]; then + uv pip install "dask==${DASK_VERSION}" fi - name: Test env: From a78c6809e5739541da8ef388632b962719f15aa9 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:34:35 +0100 Subject: [PATCH 40/53] revert changes --- .github/workflows/test.yaml | 34 ++++++++++------------------------ 1 file changed, 
10 insertions(+), 24 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 4aa597612..00c284a5a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,32 +24,17 @@ jobs: - os: macos-latest python: "3.11" dask-version: "2025.1.0" - name: "Dask 2025.1.0" + name: "macOS Python 3.11 (Dask 2025.1.0)" - os: macos-latest python: "3.11" dask-version: "latest" - name: "Dask latest" - - os: ubuntu-latest - python: "3.11" - dask-version: "2025.1.0" - name: "Dask 2025.1.0" - - os: ubuntu-latest - python: "3.11" - dask-version: "latest" - name: "Dask latest" - - os: windows-latest - python: "3.11" - dask-version: "2025.1.0" - name: "Dask 2025.1.0" - - os: windows-latest - python: "3.11" - dask-version: "latest" - name: "Dask latest" + name: "macOS Python 3.11 (Dask latest)" - os: macos-latest python: "3.12" pip-flags: "--pre" - dask-version: "latest" name: "Python 3.12 (pre-release)" + - os: windows-latest + python: "3.11" env: OS: ${{ matrix.os }} @@ -65,11 +50,12 @@ jobs: - name: Install dependencies run: | uv sync --extra test - DASK_VERSION="${{ matrix.dask-version }}" - if [[ "${DASK_VERSION}" == "latest" ]]; then - uv pip install --upgrade dask - elif [[ -n "${DASK_VERSION}" ]]; then - uv pip install "dask==${DASK_VERSION}" + if [[ "${{ matrix.os }}" == "macos-latest" && -n "${{ matrix.dask-version }}" ]]; then + if [[ "${{ matrix.dask-version }}" == "latest" ]]; then + uv pip install --upgrade dask + else + uv pip install dask==${{ matrix.dask-version }} + fi fi - name: Test env: From 8d5251b3a0239d1ca53f09ec3643631c637fa9d4 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:48:04 +0100 Subject: [PATCH 41/53] fix --- .github/workflows/test.yaml | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 00c284a5a..0b8a6146b 100644 --- a/.github/workflows/test.yaml +++ 
b/.github/workflows/test.yaml @@ -20,25 +20,17 @@ jobs: matrix: python: ["3.11", "3.13"] os: [ubuntu-latest] + dask-version: ["2025.1.0", "latest"] include: - - os: macos-latest - python: "3.11" - dask-version: "2025.1.0" - name: "macOS Python 3.11 (Dask 2025.1.0)" - - os: macos-latest - python: "3.11" - dask-version: "latest" - name: "macOS Python 3.11 (Dask latest)" - os: macos-latest python: "3.12" pip-flags: "--pre" name: "Python 3.12 (pre-release)" - - os: windows-latest - python: "3.11" env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} + DASK_VERSION: ${{ matrix.dask-version }} steps: - uses: actions/checkout@v2 @@ -50,11 +42,11 @@ jobs: - name: Install dependencies run: | uv sync --extra test - if [[ "${{ matrix.os }}" == "macos-latest" && -n "${{ matrix.dask-version }}" ]]; then - if [[ "${{ matrix.dask-version }}" == "latest" ]]; then + if [[ -n "${DASK_VERSION}" ]]; then + if [[ "${DASK_VERSION}" == "latest" ]]; then uv pip install --upgrade dask else - uv pip install dask==${{ matrix.dask-version }} + uv pip install dask==${DASK_VERSION} fi fi - name: Test From b9a228a6888b7b690d9f52379260527590cbd984 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 11:53:28 +0100 Subject: [PATCH 42/53] adjust --- .github/workflows/test.yaml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 0b8a6146b..c9a1fcc03 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -20,8 +20,28 @@ jobs: matrix: python: ["3.11", "3.13"] os: [ubuntu-latest] - dask-version: ["2025.1.0", "latest"] + include: + - os: windows-latest + python: "3.11" + dask-version: "2025.1.0" + name: "Dask 2025.1.0" + - os: windows-latest + python: "3.11" + dask-version: "2025.10.0" + name: "Dask 2025.10.0" + - os: ubuntu-latest + python: "3.11" + dask-version: "2025.1.0" + name: "Dask 2025.1.0" + - os: macos-latest + python: "3.11" + dask-version: 
"2025.1.0" + name: "Dask 2025.1.0" + - os: macos-latest + python: "3.11" + dask-version: "2025.10.0" + name: "Dask 2025.10.0" - os: macos-latest python: "3.12" pip-flags: "--pre" From 7efabfe4aebce887643c77cc7c38d2335fe17e48 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 12:45:35 +0100 Subject: [PATCH 43/53] adjust dask pin --- pyproject.toml | 2 +- src/spatialdata/_core/operations/rasterize.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7816236ab..d1196a9e6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "anndata>=0.9.1", "click", "dask-image", - "dask>=2025.1.0", + "dask>2024.11.2,<=2024.12.1", "datashader", "fsspec[s3,http]", "geopandas>=0.14", diff --git a/src/spatialdata/_core/operations/rasterize.py b/src/spatialdata/_core/operations/rasterize.py index 6da0a7cc8..423cca766 100644 --- a/src/spatialdata/_core/operations/rasterize.py +++ b/src/spatialdata/_core/operations/rasterize.py @@ -585,6 +585,7 @@ def rasterize_images_labels( raise ValueError(f"Unsupported schema {schema}") # resample the image + # TODO results here are vastly different between dask 2025.1.0 and 2025.2.0, from the latter on tests fail. 
transformed_dask = dask_image.ndinterp.affine_transform( xdata.data, matrix=matrix, From 3ed65bde8252aaaa4f14926b0d16d62f1793725a Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 13:44:18 +0100 Subject: [PATCH 44/53] adjust dask pin --- .github/workflows/test.yaml | 12 ++++++------ pyproject.toml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index c9a1fcc03..94289872a 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -24,20 +24,20 @@ jobs: include: - os: windows-latest python: "3.11" - dask-version: "2025.1.0" - name: "Dask 2025.1.0" + dask-version: "2025.2.0" + name: "Dask 2025.2.0" - os: windows-latest python: "3.11" dask-version: "2025.10.0" name: "Dask 2025.10.0" - os: ubuntu-latest python: "3.11" - dask-version: "2025.1.0" - name: "Dask 2025.1.0" + dask-version: "2025.2.0" + name: "Dask 2025.2.0" - os: macos-latest python: "3.11" - dask-version: "2025.1.0" - name: "Dask 2025.1.0" + dask-version: "2025.2.0" + name: "Dask 2025.2.0" - os: macos-latest python: "3.11" dask-version: "2025.10.0" diff --git a/pyproject.toml b/pyproject.toml index d1196a9e6..1e1c6d426 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "anndata>=0.9.1", "click", "dask-image", - "dask>2024.11.2,<=2024.12.1", + "dask>=2025.2.0", "datashader", "fsspec[s3,http]", "geopandas>=0.14", From 18242962ee5d04bdb1cfb06e0d7d9a7feb84b0cb Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 17:04:14 +0100 Subject: [PATCH 45/53] fix dask backing files and windows permissions --- src/spatialdata/_io/_utils.py | 24 ++++++++++++++++++++---- src/spatialdata/_io/io_raster.py | 4 +++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/spatialdata/_io/_utils.py b/src/spatialdata/_io/_utils.py index 281f0b040..a8e194a7b 100644 --- a/src/spatialdata/_io/_utils.py +++ b/src/spatialdata/_io/_utils.py @@ -14,6 
+14,7 @@ import zarr from anndata import AnnData +from dask._task_spec import Task from dask.array import Array as DaskArray from dask.dataframe import DataFrame as DaskDataFrame from geopandas import GeoDataFrame @@ -301,6 +302,19 @@ def _get_backing_files(element: DaskArray | DaskDataFrame) -> list[str]: return files +def _find_piece_dict(obj: dict[str, tuple[str | None]] | Task) -> dict[str, tuple[str | None | None]] | None: + """Recursively search for dict containing the key 'piece' in Dask task specs containing the parquet file path.""" + if isinstance(obj, dict): + if "piece" in obj: + return obj + elif hasattr(obj, "args"): # Handles dask._task_spec.* objects like Task and List + for v in obj.args: + result = _find_piece_dict(v) + if result is not None: + return result + return None + + def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> None: # see the types allowed for the dask graph here: https://docs.dask.org/en/stable/spec.html @@ -327,7 +341,7 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No path = getattr(v.store, "path", None) if getattr(v.store, "path", None) else v.store.root files.append(str(UPath(path).resolve())) elif name.startswith("read-parquet") or name.startswith("read_parquet"): - # Here v is a read_parquet task with arguments. + # Here v is a read_parquet task with arguments and the only value is a dictionary. if "piece" in v.args[0]: # https://github.com/dask/dask/blob/ff2488aec44d641696e0b7aa41ed9e995c710705/dask/dataframe/io/parquet/core.py#L870 parquet_file, check0, check1 = v.args[0]["piece"] @@ -341,9 +355,11 @@ def _search_for_backing_files_recursively(subgraph: Any, files: list[str]) -> No # This occurs when for example points and images are mixed, the main task still starts with # read_parquet, but the execution happens through a subgraph which we iterate over to get the # actual read_parquet task. 
- for task in v.args[0].value: - if isinstance(task.args[0], dict) and "piece" in task.args[0]: - parquet_file, check0, check1 = task.args[0]["piece"] + for task in v.args[0].values(): + # Recursively go through tasks, this is required because differences between dask versions. + piece_dict = _find_piece_dict(task) + if isinstance(piece_dict, dict) and "piece" in piece_dict: + parquet_file, check0, check1 = piece_dict["piece"] # type: ignore[misc] if not parquet_file.endswith(".parquet") or check0 is not None or check1 is not None: raise ValueError( f"Unable to parse the parquet file from the dask subgraph {subgraph}. Please " diff --git a/src/spatialdata/_io/io_raster.py b/src/spatialdata/_io/io_raster.py index ad2b22274..bc8206db1 100644 --- a/src/spatialdata/_io/io_raster.py +++ b/src/spatialdata/_io/io_raster.py @@ -347,7 +347,9 @@ def _write_raster_datatree( compute=False, ) # Compute all pyramid levels at once to allow Dask to optimize the computational graph. - da.compute(*dask_delayed) + # Optimize_graph is set to False for now as this causes permission denied errors when during atomic writes + # os.replace is called. These can also be alleviated by using 'single-threaded' scheduler. 
+ da.compute(*dask_delayed, optimize_graph=False) trans_group = group["labels"][element_name] if raster_type == "labels" else group overwrite_coordinate_transformations_raster( From 42c2452870d9197a451811e8318a72a7ff8472ee Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 17:25:30 +0100 Subject: [PATCH 46/53] fix dask mixed graph problem --- src/spatialdata/_core/query/spatial_query.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_core/query/spatial_query.py b/src/spatialdata/_core/query/spatial_query.py index f30b41c0e..483f90847 100644 --- a/src/spatialdata/_core/query/spatial_query.py +++ b/src/spatialdata/_core/query/spatial_query.py @@ -679,7 +679,8 @@ def _( if mask.sum() == 0: points_in_intrinsic_bounding_box.append(None) else: - points_in_intrinsic_bounding_box.append(points.loc[mask]) + # TODO there is a problem when mixing dask dataframe graph with dask array graph. Need to compute for now. + points_in_intrinsic_bounding_box.append(points.loc[mask.compute()]) if len(points_in_intrinsic_bounding_box) == 0: return None From 93b48bec4c8fa7ec4494d14edda7a60cf5c86417 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Mon, 3 Nov 2025 21:07:31 +0100 Subject: [PATCH 47/53] temporary fix indexing --- src/spatialdata/_core/query/spatial_query.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/spatialdata/_core/query/spatial_query.py b/src/spatialdata/_core/query/spatial_query.py index 483f90847..36b40748a 100644 --- a/src/spatialdata/_core/query/spatial_query.py +++ b/src/spatialdata/_core/query/spatial_query.py @@ -675,12 +675,21 @@ def _( if not (len_df := len(in_intrinsic_bounding_box)) == (len_bb := len(min_coordinate)): raise ValueError(f"Number of dataframes `{len_df}` is not equal to the number of bounding boxes `{len_bb}`.") points_in_intrinsic_bounding_box: list[DaskDataFrame | None] = [] + points_pd = points.compute() + attrs = points.attrs.copy() 
for mask in in_intrinsic_bounding_box: if mask.sum() == 0: points_in_intrinsic_bounding_box.append(None) else: # TODO there is a problem when mixing dask dataframe graph with dask array graph. Need to compute for now. - points_in_intrinsic_bounding_box.append(points.loc[mask.compute()]) + # we can't compute either mask or points as when we calculate either one of them + # test_query_points_multiple_partitions will fail as the mask will be used to index each partition. + # However, if we compute and then create the dask array again we get the mixed dask graph problem. + mask_np = mask.compute() + filtered_pd = points_pd[mask_np] + points_filtered = dd.from_pandas(filtered_pd, npartitions=points.npartitions) + points_filtered.attrs.update(attrs) + points_in_intrinsic_bounding_box.append(points_filtered) if len(points_in_intrinsic_bounding_box) == 0: return None From 1813c843bbc8ddec89170b5cbdc5f005a1fdda96 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 4 Nov 2025 17:37:02 +0100 Subject: [PATCH 48/53] fix rasterize --- tests/core/operations/test_rasterize.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/core/operations/test_rasterize.py b/tests/core/operations/test_rasterize.py index 25f3c3d0f..a2ffde3d4 100644 --- a/tests/core/operations/test_rasterize.py +++ b/tests/core/operations/test_rasterize.py @@ -52,10 +52,10 @@ def _rasterize(element: DataArray | DataTree, element_name: str, **kwargs) -> Da def _get_data_of_largest_scale(raster): if isinstance(raster, DataArray): - return raster.data.compute() + return raster.data xdata = get_pyramid_levels(raster, n=0) - return xdata.data.compute() + return xdata.data for element_name, raster in rasters.items(): dims = get_axes_names(raster) @@ -63,6 +63,9 @@ def _get_data_of_largest_scale(raster): slices = [all_slices[d] for d in dims] data = _get_data_of_largest_scale(raster) + # The line above used to return a numpy array.
Setting the indices of the slice to 1 would previously update + # also raster, but since dask 2025.2.0 this does not happen anymore. However, we can just set the slice to 1 + # on the dask array. data[tuple(slices)] = 1 for kwargs in [ From 50374bbc35009d4359007940d4b15d50f9f3b83f Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Tue, 4 Nov 2025 17:40:19 +0100 Subject: [PATCH 49/53] adjust github workflow --- .github/workflows/test.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 94289872a..46d6f6727 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -28,8 +28,8 @@ jobs: name: "Dask 2025.2.0" - os: windows-latest python: "3.11" - dask-version: "2025.10.0" - name: "Dask 2025.10.0" + dask-version: "latest" + name: "Dask latest" - os: ubuntu-latest python: "3.11" dask-version: "2025.2.0" @@ -40,8 +40,8 @@ jobs: name: "Dask 2025.2.0" - os: macos-latest python: "3.11" - dask-version: "2025.10.0" - name: "Dask 2025.10.0" + dask-version: "latest" + name: "Dask latest" - os: macos-latest python: "3.12" pip-flags: "--pre" From fafede5d54af6a12e9219d22121f8d4a46fe4d43 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 5 Nov 2025 00:19:03 +0100 Subject: [PATCH 50/53] move 3.13 to include --- .github/workflows/test.yaml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 46d6f6727..081dbb978 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -18,9 +18,6 @@ jobs: strategy: fail-fast: false matrix: - python: ["3.11", "3.13"] - os: [ubuntu-latest] - include: - os: windows-latest python: "3.11" @@ -34,6 +31,14 @@ jobs: python: "3.11" dask-version: "2025.2.0" name: "Dask 2025.2.0" + - os: ubuntu-latest + python: "3.11" + dask-version: "latest" + name: "Dask latest" + - os: ubuntu-latest + python: "3.13" + dask-version: 
"latest" + name: "Dask latest" - os: macos-latest python: "3.11" dask-version: "2025.2.0" From a06302d641bd22a35bb5f99122d6dfdaef357e36 Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 5 Nov 2025 00:33:53 +0100 Subject: [PATCH 51/53] make more concise --- .github/workflows/test.yaml | 45 +++++++++---------------------------- .pre-commit-config.yaml | 1 + 2 files changed, 11 insertions(+), 35 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 081dbb978..ec753d95f 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -4,7 +4,7 @@ on: push: branches: [main] tags: - - "v*" # Push events to matching v*, i.e. v1.0, v20.15.10 + - "v*" pull_request: branches: "*" @@ -13,45 +13,20 @@ jobs: runs-on: ${{ matrix.os }} defaults: run: - shell: bash -e {0} # -e to fail on error + shell: bash -e {0} strategy: fail-fast: false matrix: include: - - os: windows-latest - python: "3.11" - dask-version: "2025.2.0" - name: "Dask 2025.2.0" - - os: windows-latest - python: "3.11" - dask-version: "latest" - name: "Dask latest" - - os: ubuntu-latest - python: "3.11" - dask-version: "2025.2.0" - name: "Dask 2025.2.0" - - os: ubuntu-latest - python: "3.11" - dask-version: "latest" - name: "Dask latest" - - os: ubuntu-latest - python: "3.13" - dask-version: "latest" - name: "Dask latest" - - os: macos-latest - python: "3.11" - dask-version: "2025.2.0" - name: "Dask 2025.2.0" - - os: macos-latest - python: "3.11" - dask-version: "latest" - name: "Dask latest" - - os: macos-latest - python: "3.12" - pip-flags: "--pre" - name: "Python 3.12 (pre-release)" - + - {os: windows-latest, python: "3.11", dask-version: "2025.2.0", name: "Dask 2025.2.0"} + - {os: windows-latest, python: "3.11", dask-version: "latest", name: "Dask latest"} + - {os: ubuntu-latest, python: "3.11", dask-version: "2025.2.0", name: "Dask 2025.2.0"} + - {os: ubuntu-latest, python: "3.11", dask-version: "latest", name: "Dask latest"} + - {os: 
ubuntu-latest, python: "3.13", dask-version: "latest", name: "Dask latest"} + - {os: macos-latest, python: "3.11", dask-version: "2025.2.0", name: "Dask 2025.2.0"} + - {os: macos-latest, python: "3.11", dask-version: "latest", name: "Dask latest"} + - {os: macos-latest, python: "3.12", pip-flags: "--pre", name: "Python 3.12 (pre-release)"} env: OS: ${{ matrix.os }} PYTHON: ${{ matrix.python }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f181578ea..1e54869b6 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -12,6 +12,7 @@ repos: rev: v3.5.3 hooks: - id: prettier + exclude: ^.github/workflows/test.yaml - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.15.0 hooks: From 990891a883d7d67f451eb7a331473a5816ed22cc Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Wed, 5 Nov 2025 00:45:04 +0100 Subject: [PATCH 52/53] Apply suggestion from @melonora --- src/spatialdata/_core/operations/rasterize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/spatialdata/_core/operations/rasterize.py b/src/spatialdata/_core/operations/rasterize.py index 423cca766..6da0a7cc8 100644 --- a/src/spatialdata/_core/operations/rasterize.py +++ b/src/spatialdata/_core/operations/rasterize.py @@ -585,7 +585,6 @@ def rasterize_images_labels( raise ValueError(f"Unsupported schema {schema}") # resample the image - # TODO results here are vastly different between dask 2025.1.0 and 2025.2.0, from the latter on tests fail. 
transformed_dask = dask_image.ndinterp.affine_transform( xdata.data, matrix=matrix, From 72121d35e2f73aae3b0ba66282b99e29d7b7499b Mon Sep 17 00:00:00 2001 From: Wouter-Michiel Vierdag Date: Sat, 22 Nov 2025 19:31:50 +0100 Subject: [PATCH 53/53] fix str representation --- src/spatialdata/_core/spatialdata.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/spatialdata/_core/spatialdata.py b/src/spatialdata/_core/spatialdata.py index 3d6a9ed06..f92bc9f54 100644 --- a/src/spatialdata/_core/spatialdata.py +++ b/src/spatialdata/_core/spatialdata.py @@ -13,8 +13,7 @@ import zarr from anndata import AnnData from dask.dataframe import DataFrame as DaskDataFrame -from dask.dataframe import read_parquet -from dask.delayed import Delayed +from dask.dataframe import Scalar, read_parquet from geopandas import GeoDataFrame from shapely import MultiPolygon, Polygon from xarray import DataArray, DataTree @@ -1985,9 +1984,7 @@ def h(s: str) -> str: else: shape_str = ( "(" - + ", ".join( - [(str(dim) if not isinstance(dim, Delayed) else "") for dim in v.shape] - ) + + ", ".join([(str(dim) if not isinstance(dim, Scalar) else "") for dim in v.shape]) + ")" ) descr += f"{h(attr + 'level1.1')}{k!r}: {descr_class} with shape: {shape_str} {dim_string}"