Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
ad3f63f
dev
davidhassell Aug 22, 2025
1179c8e
dev
davidhassell Aug 22, 2025
e1d4c97
first successful write
davidhassell Aug 23, 2025
d009cc6
dev
davidhassell Aug 24, 2025
0af9c76
dev
davidhassell Aug 26, 2025
31d1c83
dev
davidhassell Aug 26, 2025
76736e8
dev
davidhassell Aug 27, 2025
b0de4b3
dev
davidhassell Aug 27, 2025
b0d2fc5
dev
davidhassell Aug 28, 2025
27f0b35
dev
davidhassell Aug 29, 2025
1bccf28
dev
davidhassell Aug 29, 2025
ed5804d
dev
davidhassell Aug 29, 2025
0ab6a87
dev
davidhassell Aug 30, 2025
b72fb6a
dev
davidhassell Aug 31, 2025
e96375a
dev
davidhassell Aug 31, 2025
22d182e
dev
davidhassell Sep 1, 2025
5ac1dde
dev
davidhassell Sep 1, 2025
2158977
dev
davidhassell Sep 2, 2025
59303f5
dev
davidhassell Sep 3, 2025
308a991
dev
davidhassell Sep 3, 2025
ccf933b
dev
davidhassell Sep 3, 2025
548d039
dev
davidhassell Sep 15, 2025
4451f99
dev
davidhassell Oct 6, 2025
2c90245
dev
davidhassell Oct 6, 2025
3b4993c
dev
davidhassell Oct 8, 2025
1deb8c0
main merge
davidhassell Oct 8, 2025
33dc03b
deprecated netcdf_flatten
davidhassell Oct 13, 2025
fef739f
dev
davidhassell Oct 13, 2025
32b8b55
upstream merge
davidhassell Oct 20, 2025
cd9aec7
dev
davidhassell Oct 20, 2025
8616dcf
dev
davidhassell Oct 20, 2025
1b95e5c
dev
davidhassell Oct 27, 2025
48fd32a
dev
davidhassell Oct 29, 2025
cdafa4b
dev
davidhassell Nov 5, 2025
2eb45be
dev
davidhassell Nov 13, 2025
e02c0b7
Doc string and comment fixes and improvements
davidhassell Jan 8, 2026
a07f9d6
Typo
davidhassell Jan 8, 2026
c7c71b7
netcdf_flatten deprecation
davidhassell Jan 8, 2026
1e4968d
rename NETCDF4 variable to is_netcdf4
davidhassell Jan 8, 2026
0cc7aad
zarr no append
davidhassell Jan 8, 2026
f66f3a8
Typo
davidhassell Jan 8, 2026
769c2a9
fix no append with zarr
davidhassell Jan 8, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions Changelog.rst
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
Version NEXTVERSION
-------------------

**2025-12-??**

**2026-01-??**

* Write Zarr v3 datasets with `cfdm.write`, and allow the reading of
grouped Zarr v2 and v3 datasets with `cfdm.read`
(https://github.com/NCAS-CMS/cfdm/issues/354)
* Read Zarr v2 and v3 datasets that contain a group hierarchy with
`cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/355)
* New function `cfdm.dataset_flatten` that replaces the deprecated
`cfdm.netcdf_flatten` (https://github.com/NCAS-CMS/cfdm/issues/355)
* New optional dependency: ``zarr>=3.1.3``
* Removed dependency (now optional): ``zarr>=3.0.8``
* Reduce the time taken to import `cfdm`
(https://github.com/NCAS-CMS/cfdm/issues/361)

Expand All @@ -29,7 +37,6 @@ Version 1.12.3.0

**2025-08-18**


* Fix `cfdm.Data.reshape` when the underlying data originate on disk
(https://github.com/NCAS-CMS/cfdm/issues/348)
* New keyword parameter to `cfdm.Field.dump`: ``data``
Expand Down Expand Up @@ -57,7 +64,7 @@ Version 1.12.2.0
retrieved from disk (https://github.com/NCAS-CMS/cfdm/issues/313)
* New keyword parameter to `cfdm.write`: ``chunk_cache``
(https://github.com/NCAS-CMS/cfdm/issues/328)
* Read Zarr datasets with `cfdm.read`
* Read Zarr v2 and v3 datasets with `cfdm.read`
(https://github.com/NCAS-CMS/cfdm/issues/335)
* Read multiple datasets simultaneously with `cfdm.read`
(https://github.com/NCAS-CMS/cfdm/issues/336)
Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,12 @@ inspecting it:
The ``cfdm`` package can:

* read field and domain constructs from netCDF, CDL, and Zarr datasets
with a choice of netCDF backends, and in local, http, and s3 locations,
* be fully flexible with respect to HDF5 chunking,
with a choice of netCDF backends, and in local, http, and s3
locations,
* be fully flexible with respect to dataset storage chunking,
* create new field and domain constructs in memory,
* write and append field and domain constructs to netCDF datasets on disk,
* write and append field and domain constructs to netCDF and Zarr v3
datasets on disk,
* read, write, and manipulate UGRID mesh topologies,
* read, write, and create coordinates defined by geometry cells,
* read and write netCDF4 string data-type variables,
Expand Down
3 changes: 2 additions & 1 deletion cfdm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
is_log_level_debug,
is_log_level_detail,
is_log_level_info,
netcdf_flatten,
)

# Though these are internal-use methods, include them in the namespace
Expand Down Expand Up @@ -161,7 +162,7 @@
from .cfdmimplementation import CFDMImplementation, implementation

from .read_write import read, write
from .read_write.netcdf.flatten import netcdf_flatten
from .read_write.netcdf.flatten import dataset_flatten

from .examplefield import example_field, example_fields, example_domain

Expand Down
41 changes: 41 additions & 0 deletions cfdm/cfdmimplementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,6 +1103,25 @@ def nc_get_dataset_chunksizes(self, data):
"""
return data.nc_dataset_chunksizes()

def nc_get_dataset_shards(self, data):
    """Get the Zarr dataset sharding strategy for the data.

    .. versionadded:: (cfdm) NEXTVERSION

    .. seealso:: `nc_set_dataset_shards`

    :Parameters:

        data: `Data`
            The data whose sharding strategy is required.

    :Returns:

        `int` or `tuple` or `None`
            The dataset sharding strategy.

    """
    # Delegate to the Data object's own accessor
    shards = data.nc_dataset_shards()
    return shards

def nc_get_sample_dimension(self, count, default=None):
"""Return the name of the netCDF sample dimension.

Expand Down Expand Up @@ -1245,6 +1264,28 @@ def nc_set_hdf5_chunksizes(self, data, chunksizes):
"Use 'nc_set_dataset_chunksizes' instead."
)

def nc_set_dataset_shards(self, data, shards):
    """Set the Zarr dataset sharding strategy for the data.

    .. versionadded:: (cfdm) NEXTVERSION

    .. seealso:: `nc_get_dataset_shards`

    :Parameters:

        data: `Data`
            The data whose sharding strategy is to be set.

        shards: `None` or `int` or sequence of `int`
            The sharding strategy to apply when writing to a Zarr
            dataset.

    :Returns:

        `None`

    """
    # Delegate to the Data object's own mutator, passing its
    # return value (documented as `None`) straight through
    out = data.nc_set_dataset_shards(shards)
    return out

def parameters(self, parent):
"""Return all parameters from a component.

Expand Down
10 changes: 9 additions & 1 deletion cfdm/data/aggregatedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,17 @@ def _parse_fragment_array(self, aggregated_filename, fragment_array):
if not scalar:
identifier = fa_identifiers[index].item()

uri = fa_uris[index]
try:
# 'uri' is scalar numpy string type
uri = uri.item()
except AttributeError:
# E.g. 'uri' is already a `str` instance
pass

parsed_fragment_array[index] = {
"map": shape,
"uri": fa_uris[index].item(),
"uri": uri,
"identifier": identifier,
}
else:
Expand Down
6 changes: 4 additions & 2 deletions cfdm/data/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
)
from ..mixin.container import Container
from ..mixin.files import Files
from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks
from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks, NetCDFShards
from ..units import Units
from .abstract import Array
from .creation import to_dask
Expand All @@ -50,7 +50,9 @@
logger = logging.getLogger(__name__)


class Data(Container, NetCDFAggregation, NetCDFChunks, Files, core.Data):
class Data(
Container, NetCDFAggregation, NetCDFChunks, NetCDFShards, Files, core.Data
):
"""An N-dimensional data array with units and masked values.

* Contains an N-dimensional, indexable and broadcastable array with
Expand Down
1 change: 1 addition & 0 deletions cfdm/data/fragment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
from .fragmenth5netcdfarray import FragmentH5netcdfArray
from .fragmentnetcdf4array import FragmentNetCDF4Array
from .fragmentuniquevaluearray import FragmentUniqueValueArray
from .fragmentzarrarray import FragmentZarrArray
7 changes: 6 additions & 1 deletion cfdm/data/fragment/fragmentfilearray.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,17 @@ def __new__(cls, *args, **kwargs):
"""
# Import fragment classes. Do this here (as opposed to outside
# the class) to aid subclassing.
from . import FragmentH5netcdfArray, FragmentNetCDF4Array
from . import (
FragmentH5netcdfArray,
FragmentNetCDF4Array,
FragmentZarrArray,
)

instance = super().__new__(cls)
instance._FragmentArrays = (
FragmentNetCDF4Array,
FragmentH5netcdfArray,
FragmentZarrArray,
)
return instance

Expand Down
10 changes: 10 additions & 0 deletions cfdm/data/fragment/fragmentzarrarray.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from ..zarrarray import ZarrArray
from .mixin import FragmentFileArrayMixin


class FragmentZarrArray(FragmentFileArrayMixin, ZarrArray):
    """A fragment of aggregated data in a file accessed with `zarr`.

    .. versionadded:: (cfdm) NEXTVERSION

    """
11 changes: 6 additions & 5 deletions cfdm/data/h5netcdfarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,14 +194,15 @@ def get_groups(self, address):
return out[:-1], out[-1]

def open(self, **kwargs):
"""Return a dataset file object and address.

When multiple files have been provided an attempt is made to
open each one, in the order stored, and a file object is
returned from the first file that exists.
"""Return a dataset object and address.

.. versionadded:: (cfdm) 1.11.2.0

:Parameters:

kwargs: optional
Extra keyword arguments to `h5netcdf.File`.

:Returns:

(`h5netcdf.File`, `str`)
Expand Down
13 changes: 7 additions & 6 deletions cfdm/data/netcdf4array.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,12 +205,13 @@ def get_groups(self, address):
out = address.split("/")[1:]
return out[:-1], out[-1]

def open(self):
"""Return a dataset file object and address.
def open(self, **kwargs):
"""Return a dataset object and address.

When multiple files have been provided an attempt is made to
open each one, in the order stored, and a file object is
returned from the first file that exists.
:Parameters:

kwargs: optional
Extra keyword arguments to `netCDF4.Dataset`.

:Returns:

Expand All @@ -221,4 +222,4 @@ def open(self):
"""
import netCDF4

return super().open(netCDF4.Dataset, mode="r")
return super().open(netCDF4.Dataset, mode="r", **kwargs)
15 changes: 12 additions & 3 deletions cfdm/data/netcdfindexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
class netcdf_indexer:
"""A data indexer that also applies netCDF masking and unpacking.

Here "netCDF4" refers to the API of the netCDF data model, rather
Here "netCDF" refers to the API of the netCDF data model, rather
than any particular dataset encoding or software library API.

Indexing may be orthogonal or non-orthogonal. Orthogonal indexing
Expand Down Expand Up @@ -395,9 +395,15 @@ def _default_FillValue(self, dtype):
"""
from netCDF4 import default_fillvals

if dtype.kind in "OS":
kind = dtype.kind
if kind in "OS":
return default_fillvals["S1"]

if kind == "T":
# np.dtypes.StringDType, which stores variable-width
# string data in a UTF-8 encoding, as used by `zarr`
return ""

return default_fillvals[dtype.str[1:]]

def _index(self, index, data=None):
Expand Down Expand Up @@ -623,7 +629,10 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int):
if fvalisnan:
mask = np.isnan(data)
else:
mask = data == fval
# Must use `np.asanyarray` here, to ensure that
# 'mask' is never a `bool`, which would make the
# following 'mask.any' call fail.
mask = np.asanyarray(data == fval)

if mask.any():
if fill_value is None:
Expand Down
22 changes: 17 additions & 5 deletions cfdm/data/zarrarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,18 +98,30 @@ def close(self, dataset):
# `zarr.Group` objects don't need closing
pass

def open(self):
"""Return a dataset file object and address.
def open(self, **kwargs):
"""Return a dataset object and address.

.. versionadded:: (cfdm) 1.12.2.0

:Parameters:

kwargs: optional
Extra keyword arguments to `zarr.open`.

:Returns:

(`zarr.Group`, `str`)
The dataset object open in read-only mode, and the
variable name of the data within the dataset.

"""
import zarr

return super().open(zarr.open, mode="r")
try:
import zarr
except ModuleNotFoundError as error:
error.msg += (
". Install the 'zarr' package "
"(https://pypi.org/project/zarr) to read Zarr datasets"
)
raise

return super().open(zarr.open, mode="r", **kwargs)
Loading