diff --git a/Changelog.rst b/Changelog.rst index 73bf1fb76..1bcd39f33 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,8 +1,16 @@ Version NEXTVERSION -------------------- - -**2025-12-??** +**2026-01-??** + +* Write Zarr v3 datasets with `cfdm.write`, and allow the reading of + grouped Zarr v2 and v3 datasets with `cfdm.read` + (https://github.com/NCAS-CMS/cfdm/issues/354) +* Read Zarr v2 and v3 datasets that contain a group hierarchy with + `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/355) +* New function `cfdm.dataset_flatten` that replaces the deprecated + `cfdm.netcdf_flatten` (https://github.com/NCAS-CMS/cfdm/issues/355) +* New optional dependency: ``zarr>=3.1.3`` +* Removed dependency (now optional): ``zarr>=3.0.8`` * Reduce the time taken to import `cfdm` (https://github.com/NCAS-CMS/cfdm/issues/361) @@ -29,7 +37,6 @@ Version 1.12.3.0 **2025-08-18** - * Fix `cfdm.Data.reshape` when the underlying data originate on disk (https://github.com/NCAS-CMS/cfdm/issues/348) * New keyword parameter to `cfdm.Field.dump`: ``data`` @@ -57,7 +64,7 @@ Version 1.12.2.0 retrieved from disk (https://github.com/NCAS-CMS/cfdm/issues/313) * New keyword parameter to `cfdm.write`: ``chunk_cache`` (https://github.com/NCAS-CMS/cfdm/issues/328) -* Read Zarr datasets with `cfdm.read` +* Read Zarr v2 and v3 datasets with `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/335) * Read multiple datasets simultaneously with `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/336) diff --git a/README.md b/README.md index f6bcfff77..e9d4af6a4 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,12 @@ inspecting it: The ``cfdm`` package can: * read field and domain constructs from netCDF, CDL, and Zarr datasets - with a choice of netCDF backends, and in local, http, and s3 locations, -* be fully flexible with respect to HDF5 chunking, + with a choice of netCDF backends, and in local, http, and s3 + locations, +* be fully flexible with respect to dataset storage chunking, * create new field and domain constructs in memory, -* write and append field and domain constructs to netCDF datasets on disk, +* write and append field and domain constructs to netCDF and Zarr v3 + datasets on disk, * read, write, and manipulate UGRID mesh topologies, * read, write, and create coordinates defined by geometry cells, * read and write netCDF4 string data-type variables, diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 31373db6f..f5f8a6db7 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -75,6 +75,7 @@ is_log_level_debug, is_log_level_detail, is_log_level_info, + netcdf_flatten, ) # Though these are internal-use methods, include them in the namespace @@ -161,7 +162,7 @@ from .cfdmimplementation import CFDMImplementation, implementation from .read_write import read, write -from .read_write.netcdf.flatten import netcdf_flatten +from .read_write.netcdf.flatten import dataset_flatten from .examplefield import example_field, example_fields, example_domain diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 11c6fefce..3e3a407d2 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1103,6 +1103,25 @@ def nc_get_dataset_chunksizes(self, data): """ return data.nc_dataset_chunksizes() + def nc_get_dataset_shards(self, data): + """Get the Zarr dataset sharding strategy for the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_set_dataset_shards` + + :Parameters: + + data: `Data` + + :Returns: + + `int` or `tuple` or `None` + The dataset sharding strategy. 
+ + """ + return data.nc_dataset_shards() + def nc_get_sample_dimension(self, count, default=None): """Return the name of the netCDF sample dimension. @@ -1245,6 +1264,28 @@ def nc_set_hdf5_chunksizes(self, data, chunksizes): "Use 'nc_set_dataset_chunksizes' instead." ) + def nc_set_dataset_shards(self, data, shards): + """Set the Zarr dataset sharding strategy for the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_get_dataset_shards` + + :Parameters: + + data: `Data` + + shards: `None` or `int` or sequence of `int` + Set the sharding strategy when writing to a Zarr + dataset. + + :Returns: + + `None` + + """ + return data.nc_set_dataset_shards(shards) + def parameters(self, parent): """Return all parameters from a component. diff --git a/cfdm/data/aggregatedarray.py b/cfdm/data/aggregatedarray.py index 0afbf070f..c271bf453 100644 --- a/cfdm/data/aggregatedarray.py +++ b/cfdm/data/aggregatedarray.py @@ -262,9 +262,17 @@ def _parse_fragment_array(self, aggregated_filename, fragment_array): if not scalar: identifier = fa_identifiers[index].item() + uri = fa_uris[index] + try: + # 'uri' is scalar numpy string type + uri = uri.item() + except AttributeError: + # E.g. 'uri' is already a `str` instance + pass + parsed_fragment_array[index] = { "map": shape, - "uri": fa_uris[index].item(), + "uri": uri, "identifier": identifier, } else: diff --git a/cfdm/data/data.py b/cfdm/data/data.py index 23480aa72..b2ae1520e 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -23,7 +23,7 @@ ) from ..mixin.container import Container from ..mixin.files import Files -from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks +from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks, NetCDFShards from ..units import Units from .abstract import Array from .creation import to_dask @@ -50,7 +50,9 @@ logger = logging.getLogger(__name__) -class Data(Container, NetCDFAggregation, NetCDFChunks, Files, core.Data): +class Data( + Container, NetCDFAggregation, NetCDFChunks, NetCDFShards, Files, core.Data +): """An N-dimensional data array with units and masked values. * Contains an N-dimensional, indexable and broadcastable array with diff --git a/cfdm/data/fragment/__init__.py b/cfdm/data/fragment/__init__.py index 8c7c76bd0..e4edf076c 100644 --- a/cfdm/data/fragment/__init__.py +++ b/cfdm/data/fragment/__init__.py @@ -2,3 +2,4 @@ from .fragmenth5netcdfarray import FragmentH5netcdfArray from .fragmentnetcdf4array import FragmentNetCDF4Array from .fragmentuniquevaluearray import FragmentUniqueValueArray +from .fragmentzarrarray import FragmentZarrArray diff --git a/cfdm/data/fragment/fragmentfilearray.py b/cfdm/data/fragment/fragmentfilearray.py index 0e4e9c07a..4241f1c8c 100644 --- a/cfdm/data/fragment/fragmentfilearray.py +++ b/cfdm/data/fragment/fragmentfilearray.py @@ -26,12 +26,17 @@ def __new__(cls, *args, **kwargs): """ # Import fragment classes. Do this here (as opposed to outside # the class) to aid subclassing. - from . import FragmentH5netcdfArray, FragmentNetCDF4Array + from . 
import ( + FragmentH5netcdfArray, + FragmentNetCDF4Array, + FragmentZarrArray, + ) instance = super().__new__(cls) instance._FragmentArrays = ( FragmentNetCDF4Array, FragmentH5netcdfArray, + FragmentZarrArray, ) return instance diff --git a/cfdm/data/fragment/fragmentzarrarray.py b/cfdm/data/fragment/fragmentzarrarray.py new file mode 100644 index 000000000..3ea293df2 --- /dev/null +++ b/cfdm/data/fragment/fragmentzarrarray.py @@ -0,0 +1,10 @@ +from ..zarrarray import ZarrArray +from .mixin import FragmentFileArrayMixin + + +class FragmentZarrArray(FragmentFileArrayMixin, ZarrArray): + """A fragment of aggregated data in a file accessed with `zarr`. + + .. versionadded:: (cfdm) NEXTVERSION + + """ diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 7e1b45de7..913609318 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -194,14 +194,15 @@ def get_groups(self, address): return out[:-1], out[-1] def open(self, **kwargs): - """Return a dataset file object and address. - - When multiple files have been provided an attempt is made to - open each one, in the order stored, and a file object is - returned from the first file that exists. + """Return a dataset object and address. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + kwargs: optional + Extra keyword arguments to `h5netcdf.File`. + :Returns: (`h5netcdf.File`, `str`) diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 5f515da71..ccf0d1e1c 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -205,12 +205,13 @@ def get_groups(self, address): out = address.split("/")[1:] return out[:-1], out[-1] - def open(self): - """Return a dataset file object and address. + def open(self, **kwargs): + """Return a dataset object and address. - When multiple files have been provided an attempt is made to - open each one, in the order stored, and a file object is - returned from the first file that exists. + :Parameters: + + kwargs: optional + Extra keyword arguments to `netCDF4.Dataset`. :Returns: @@ -221,4 +222,4 @@ def open(self): """ import netCDF4 - return super().open(netCDF4.Dataset, mode="r") + return super().open(netCDF4.Dataset, mode="r", **kwargs) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 8a2d4cd0a..4f1efd370 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -32,7 +32,7 @@ class netcdf_indexer: """A data indexer that also applies netCDF masking and unpacking. - Here "netCDF4" refers to the API of the netCDF data model, rather + Here "netCDF" refers to the API of the netCDF data model, rather than any particular dataset encoding or software library API. Indexing may be orthogonal or non-orthogonal. Orthogonal indexing @@ -395,9 +395,15 @@ def _default_FillValue(self, dtype): """ from netCDF4 import default_fillvals - if dtype.kind in "OS": + kind = dtype.kind + if kind in "OS": return default_fillvals["S1"] + if kind == "T": + # np.dtypes.StringDType, which stores variable-width + # string data in a UTF-8 encoding, as used by `zarr` + return "" + return default_fillvals[dtype.str[1:]] def _index(self, index, data=None): @@ -623,7 +629,10 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): if fvalisnan: mask = np.isnan(data) else: - mask = data == fval + # Must use `np.asanyarray` here, to ensure that + # 'mask' is a never a `bool`, which would make the + # following 'mask.any' call fail. 
+ mask = np.asanyarray(data == fval) if mask.any(): if fill_value is None: diff --git a/cfdm/data/zarrarray.py b/cfdm/data/zarrarray.py index cfa1ccf5c..4f4e46a7a 100644 --- a/cfdm/data/zarrarray.py +++ b/cfdm/data/zarrarray.py @@ -98,11 +98,16 @@ def close(self, dataset): # `zarr.Group` objects don't need closing pass - def open(self): - """Return a dataset file object and address. + def open(self, **kwargs): + """Return a dataset object and address. .. versionadded:: (cfdm) 1.12.2.0 + :Parameters: + + kwargs: optional + Extra keyword arguments to `zarr.open`. + :Returns: (`zarr.Group`, `str`) @@ -110,6 +115,13 @@ def open(self): variable name of the data within the dataset. """ - import zarr - - return super().open(zarr.open, mode="r") + try: + import zarr + except ModuleNotFoundError as error: + error.msg += ( + ". Install the 'zarr' package " + "(https://pypi.org/project/zarr) to read Zarr datasets" + ) + raise + + return super().open(zarr.open, mode="r", **kwargs) diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 8892b8ffe..262198b8c 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -536,7 +536,6 @@ None}``.""", # read store_dataset_chunks "{{read store_dataset_chunks: `bool`, optional}}": """store_dataset_chunks: `bool`, optional - If True (the default) then store the dataset chunking strategy for each returned data array. The dataset chunking strategy is then accessible via an object's @@ -545,17 +544,33 @@ written to a new netCDF file with `{{package}}.write` (unless the strategy is modified prior to writing). - If False, or if the dataset being read does not support - chunking (such as a netCDF-3 dataset), then no dataset - chunking strategy is stored (i.e. an - `nc_dataset_chunksizes` method will return `None` for all - `Data` objects). In this case, when the data is written to - a new netCDF file, the dataset chunking strategy will be - determined by `{{package}}.write`. + If False then no dataset chunking strategy is stored + (i.e. the `nc_dataset_chunksizes` method will return + `None` for all returned `Data` objects). In this case, + when the data is written to a new dataset, the dataset + chunking strategy will be determined by + `{{package}}.write`. See the `{{package}}.write` *dataset_chunks* parameter for details on how the dataset chunking strategy is determined at the time of writing.""", + # read store_dataset_shards + "{{read store_dataset_shards: `bool`, optional}}": """store_dataset_shards: `bool`, optional + If True (the default) then store the Zarr dataset sharding + strategy for each returned data array. The dataset + sharding strategy is then accessible via an object's + `nc_dataset_shards` method. When the dataset sharding + strategy is stored, it will be used when the data is + written to a new Zarr dataset with `{{package}}.write` + (unless the strategy is modified prior to writing). + + If False, or if the dataset being read does not support + sharding (such as a Zarr v2 or netCDF dataset), then no + dataset sharding strategy is stored (i.e. the + `nc_dataset_shards` method will return `None` for all + returned `Data` objects). In this case, when the data is + written to a new Zarr dataset, the dataset sharding + strategy will be determined by `{{package}}.write`.""", # read cfa "{{read cfa: `dict`, optional}}": """cfa: `dict`, optional Configure the reading of CF-netCDF aggregation files. 
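A minimal sketch of how the sharding additions in this changeset might be used together; the filenames are placeholders, and the ``'ZARR3'`` format string is an assumption inferred from the new ``ZARR_FMTS`` constant rather than a confirmed `cfdm.write` *fmt* value. The *store_dataset_shards* keyword and the `Data` shard methods are taken from the changes in this diff.

>>> import cfdm
>>> # Keep any stored sharding strategy when reading (the default)
>>> f = cfdm.read('file.nc', store_dataset_shards=True)[0]
>>> # Ask for four dataset chunks per shard on the field's data
>>> f.data.nc_set_dataset_shards(4)
>>> f.data.nc_dataset_shards()
4
>>> # The strategy is honoured for Zarr output and ignored otherwise
>>> cfdm.write(f, 'out.zarr', fmt='ZARR3')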
@@ -703,6 +718,61 @@ Note that setting ``recursive=True, followlinks=True`` can lead to infinite recursion if a symbolic link points to a parent directory of itself.""", + # read group_dimension_search + "{{read group_dimension_search: `str`, optional}}": """group_dimension_search: `str`, optional + How to interpret a dimension name that contains no + group-separator characters, such as ``dim`` (as opposed to + ``group/dim``, ``/group/dim``, ``../dim``, etc.). The + *group_dimension_search* parameter must be one of: + + * ``'closest_ancestor'`` + + This is the default and is the behaviour defined by the + CF conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + dimension with the same name and size in an ancestor + group, if one exists. If multiple such dimensions exist, + then the correspondence is with the dimension in the + ancestor group that is **closest** to the sub-group + (i.e. that is furthest away from the root group). + + * ``'furthest_ancestor'`` + + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + one with the same name and size in an ancestor group, if + one exists. If multiple such dimensions exist, then the + correspondence is with the dimension in the ancestor + group that is **furthest away** from the sub-group + (i.e. that is closest to the root group). + + * ``'local'`` + + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + + Assume that the Zarr sub-group dimension is different to + any with the same name and size in all ancestor groups. + + .. note:: For a netCDF dataset, for which it is always + well-defined in which group a dimension is + defined, *group_dimension_search* may only take + the default value of ``'closest_ancestor'`, + which applies the behaviour defined by the CF + conventions (section 2.7 Groups). + + For a Zarr dataset, for which there is no means + of indicating whether or not the same dimension + names that appear in different groups correspond + to each other, setting this parameter may be + necessary for the correct interpretation of the + dataset in the event that its dimensions are + named in a manner that is inconsistent with CF + rules defined by the CF conventions (section 2.7 + Groups).""", # persist "{{persist description}}": """Persisting turns an underlying lazy dask array into an equivalent chunked dask array, but now with the results fully @@ -1313,6 +1383,20 @@ names, are normalised to absolute paths prior to the replacement. If False (the default) then no normalisation is done.""", + # sharding + "{{sharding description}}": """ + When writing to a Zarr dataset, sharding provides a mechanism + to store multiple dataset chunks in a single storage object or + file. Without sharding, each dataset chunk is written to its + own file. Traditional file systems and object storage systems + may have performance issues storing and accessing large number + of files, and small files can be inefficient to store if they + are smaller than the block size of the file system. Sharding + can improve performance by creating fewer, and larger, files + for storing the dataset chunks. 
+ + The sharding strategy is ignored when writing to a non-Zarr + dataset.""", # ---------------------------------------------------------------- # Method description substitutions (4 levels of indentation) # ---------------------------------------------------------------- @@ -1340,4 +1424,18 @@ "{{Returns original filenames}}": """The original file names in normalised absolute form. If there are no original files then an empty `set` will be returned.""", + # sharding options + "{{sharding options}}": """* `None` + + No sharding. + + * `int` + + The integer number of dataset chunks to be stored in + a single shard, favouring an equal number of dataset + chunks along each shard dimension. + + * sequence of `int` + + The number of chunks along each shard dimension.""", } diff --git a/cfdm/functions.py b/cfdm/functions.py index 4c515f987..760ced48b 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -406,7 +406,7 @@ def environment(display=True, paths=True): netCDF4: 1.7.2 /home/miniconda3/lib/python3.12/site-packages/netCDF4/__init__.py h5netcdf: 1.3.0 /home/miniconda3/lib/python3.12/site-packages/h5netcdf/__init__.py h5py: 3.12.1 /home/miniconda3/lib/python3.12/site-packages/h5py/__init__.py - zarr: 3.0.8 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py + zarr: 3.1.3 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py s3fs: 2024.12.0 /home/miniconda3/lib/python3.12/site-packages/s3fs/__init__.py scipy: 1.15.1 /home/miniconda3/lib/python3.12/site-packages/scipy/__init__.py dask: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/dask/__init__.py @@ -426,7 +426,7 @@ def environment(display=True, paths=True): netCDF4: 1.7.2 h5netcdf: 1.3.0 h5py: 3.12.1 - zarr: 3.0.8 + zarr: 3.1.3 s3fs: 2024.12.0 scipy: 1.15.1 dask: 2025.5.1 @@ -2430,6 +2430,21 @@ def parse_indices(shape, indices, keepdims=True, newaxis=False): return parsed_indices +def netcdf_flatten(*args, **kwargs): + """Create a flattened version of a grouped CF dataset. + + .. versionadded:: (cfdm) 1.11.2.0 + + """ + _DEPRECATION_ERROR_FUNCTION( + "netcdf_flatten", + "Use 'cfdm.dataset_flatten' instead, " + "which has a slightly different API.", + version="NEXTVERSION", + removed_at="1.15.0.0", + ) # pragma: no cover + + def _DEPRECATION_ERROR_KWARGS( instance, method, @@ -2479,3 +2494,20 @@ def _DEPRECATION_ERROR_METHOD( f"at version {version} and is no longer available{removed_at}. " f"{message}" ) + + +def _DEPRECATION_ERROR_FUNCTION( + func, message="", version=None, removed_at=None +): + """Error handling for deprecated functions. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + if removed_at: + removed_at = f" and will be removed at version {removed_at}" + + raise DeprecationError( + f"Function {func!r} has been deprecated at version {version} and is " + f"no longer available{removed_at}. {message}" + ) diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index eb02eaed5..2da9e9ee1 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -2543,7 +2543,7 @@ def nc_dataset_chunksizes(self, todict=False): .. seealso:: `nc_clear_dataset_chunksizes`, `nc_set_dataset_chunksizes`, `{{package}}.read`, - `{{package}}.write` + `nc_dataset_shards`, `{{package}}.write` :Parameters: @@ -2614,7 +2614,7 @@ def nc_clear_dataset_chunksizes(self): .. seealso:: `nc_dataset_chunksizes`, `nc_set_dataset_chunksizes`, `{{package}}.read`, - `{{package}}.write` + `nc_dataset_shards`, `{{package}}.write` :Returns: @@ -2664,7 +2664,8 @@ def nc_set_dataset_chunksizes(self, chunksizes): .. 
seealso:: `nc_dataset_chunksizes`, `nc_clear_dataset_chunksizes`, - `{{package}}.read`, `{{package}}.write` + `nc_dataset_shards`, `{{package}}.read`, + `{{package}}.write` :Parameters: @@ -5129,3 +5130,180 @@ def nc_set_aggregation_write_status(self, status): ) self._nc_set_aggregation_write_status(status) + + +class NetCDFShards(NetCDFMixin): + """Mixin class for accessing Zarr dataset shard size. + + When writing to a Zarr dataset, sharding provides a mechanism to + store multiple chunks in a single storage object or file. This can + be useful because traditional file systems and object storage + systems may have performance issues storing and accessing a large + number of files. Additionally, small files can be inefficient to + store if they are smaller than the block size of the file system. + + The sharding strategy is ignored when writing to a non-Zarr + dataset. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def nc_dataset_shards(self): + """Get the Zarr dataset shard size for the data. + + {{sharding description}} + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_clear_dataset_shards`, + `nc_set_dataset_shards`, `nc_dataset_chunksizes`, + `{{package}}.write` + + :Returns: + + `None` or `int` or sequence of `int` + The current sharding strategy. One of: + + {{sharding options}} + + **Examples** + + >>> d.shape + (1, 100, 200) + >>> d.nc_dataset_chunksizes() + (1, 30, 50) + >>> d.nc_set_dataset_shards(4) + >>> d.nc_dataset_shards() + 4 + >>> d.nc_clear_dataset_shards() + 4 + >>> print(d.nc_dataset_shards()) + None + >>> d.nc_set_dataset_shards((5, 4)) + >>> d.nc_dataset_shards() + (5, 4) + + """ + return self._get_netcdf().get("dataset_shards") + + def nc_clear_dataset_shards(self): + """Clear the Zarr dataset shard size for the data. + + {{sharding description}} + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_dataset_shards`, `nc_set_dataset_shards`, + `nc_dataset_chunksizes`, `{{package}}.write` + + :Returns: + + `None` or `int` or sequence of `int` + The cleared sharding strategy. One of: + + {{sharding options}} + + **Examples** + + >>> d.shape + (1, 100, 200) + >>> d.nc_dataset_chunksizes() + (1, 30, 50) + >>> d.nc_set_dataset_shards(4) + >>> d.nc_dataset_shards() + 4 + >>> d.nc_clear_dataset_shards() + 4 + >>> print(d.nc_dataset_shards()) + None + >>> d.nc_set_dataset_shards((5, 4)) + >>> d.nc_dataset_shards() + (5, 4) + + """ + return self._get_netcdf().pop("dataset_shards", None) + + def nc_set_dataset_shards(self, shards): + """Set the Zarr dataset sharding strategy for the data. + + {{sharding description}} + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_dataset_shards`, `nc_clear_dataset_shards`, + `nc_dataset_chunksizes`, `{{package}}.write` + + :Parameters: + + shards: `None` or `int` or sequence of `int` + The new sharding strategy. One of: + + {{sharding options}} + + *Example:* + For two dimensional data, the following are + equivalent: ``25`` and ``(5, 5)``. 
+ + :Returns: + + `None` + + **Examples** + + >>> d.shape + (1, 100, 200) + >>> d.nc_dataset_chunksizes() + (1, 30, 50) + >>> d.nc_set_dataset_shards(4) + >>> d.nc_dataset_shards() + 4 + >>> d.nc_clear_dataset_shards() + 4 + >>> print(d.nc_dataset_shards()) + None + >>> d.nc_set_dataset_shards((5, 4)) + >>> d.nc_dataset_shards() + (5, 4) + >>> d.nc_set_dataset_shards(None) + >>> print(d.nc_dataset_shards()) + None + + """ + if shards is None: + self.nc_clear_dataset_shards() + return + + if isinstance(shards, Integral): + if shards < 1: + raise ValueError( + "'shards' must be None, a positive integer, or a " + f"sequence of positive integers. Got: {shards!r}" + ) + + self._set_netcdf("dataset_shards", shards) + return + + try: + shards = tuple(shards) + except TypeError: + raise ValueError( + "'shards' must be None, a positive integer, or a " + f"sequence of positive integers. Got: {shards!r}" + ) + + if len(shards) != len(self.shape): + raise ValueError( + f"When shards is a sequence it must have the same length as " + f"the number of data dimensions ({len(self.shape)}). " + f"Got: {shards!r} " + ) + + for n, i in enumerate(shards): + if not (isinstance(i, Integral) and i > 0): + raise ValueError( + f"Shard size for dimension position {n} must be " + f"a positive integer. Got: {i!r}" + ) + + self._set_netcdf("dataset_shards", shards) diff --git a/cfdm/read_write/abstract/abstractio.py b/cfdm/read_write/abstract/abstractio.py index ae1cde4d7..ccfa6a433 100644 --- a/cfdm/read_write/abstract/abstractio.py +++ b/cfdm/read_write/abstract/abstractio.py @@ -16,13 +16,13 @@ def __init__(self, implementation): self.implementation = implementation @abc.abstractmethod - def file_close(self, *args, **kwargs): - """Close the dataset file.""" + def dataset_close(self, *args, **kwargs): + """Close the dataset.""" raise NotImplementedError() # pragma: no cover @abc.abstractmethod - def file_open(self, *args, **kwargs): - """Open the dataset file.""" + def dataset_open(self, *args, **kwargs): + """Open the dataset.""" raise NotImplementedError() # pragma: no cover @@ -31,7 +31,7 @@ class IORead(IO, metaclass=abc.ABCMeta): @abc.abstractmethod def read(self, *args, **kwargs): - """Read fields from a netCDF file.""" + """Read fields from a netCDF dataset.""" raise NotImplementedError() # pragma: no cover @@ -40,5 +40,5 @@ class IOWrite(IO, metaclass=abc.ABCMeta): @abc.abstractmethod def write(self, *args, **kwargs): - """Write fields to a netCDF file.""" + """Write fields to a netCDF dataset.""" raise NotImplementedError() # pragma: no cover diff --git a/cfdm/read_write/netcdf/constants.py b/cfdm/read_write/netcdf/constants.py index a0e1c972e..c49a5645c 100644 --- a/cfdm/read_write/netcdf/constants.py +++ b/cfdm/read_write/netcdf/constants.py @@ -67,6 +67,9 @@ # NetCDF-4 file formats NETCDF4_FMTS = ("NETCDF4", "NETCDF4_CLASSIC") +# Zarr dataset formats +ZARR_FMTS = ("ZARR3",) + # -------------------------------------------------------------------- # Quantization # -------------------------------------------------------------------- diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py index 82e6a3c9e..6c106911d 100644 --- a/cfdm/read_write/netcdf/flatten/__init__.py +++ b/cfdm/read_write/netcdf/flatten/__init__.py @@ -13,4 +13,4 @@ """ -from .flatten import netcdf_flatten +from .flatten import dataset_flatten diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 2a4c16c5e..ee51f0a22 100644 --- 
a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -16,6 +16,8 @@ import logging import warnings +from cfdm.functions import is_log_level_debug + from .config import ( flattener_attribute_map, flattener_dimension_map, @@ -27,6 +29,8 @@ ref_not_found_error, ) +logger = logging.getLogger(__name__) + # Mapping from numpy dtype endian format to that expected by netCDF4 _dtype_endian_lookup = { "=": "native", @@ -41,31 +45,33 @@ referencing_attributes = set(flattening_rules) -def netcdf_flatten( +def dataset_flatten( input_ds, output_ds, strict=True, - omit_data=False, - write_chunksize=134217728, + copy_data=True, + group_dimension_search="closest_ancestor", ): - """Create a flattened version of a grouped netCDF dataset. + """Create a flattened version of a grouped CF dataset. - **CF-netCDF coordinate variables** + The following dataset formats can be flattened: netCDF and Zarr. - When a CF-netCDF coordinate variable in the input dataset is in a + **CF coordinate variables** + + When a CF coordinate variable (i.e. a one-dimensional variable + with the same name as its dimension) in the input dataset is in a different group to its corresponding dimension, the same variable - in the output flattened dataset will no longer be a CF-netCDF - coordinate variable, as its name will be prefixed with a different - group identifier than its dimension. + in the output flattened dataset will no longer be a CF coordinate + variable, as its name will be prefixed with a different group + identifier than its dimension. In such cases it is up to the user to apply the proximal and lateral search algorithms to the flattened dataset returned by - `netcdf_flatten`, in conjunction with the mappings defined in the + `dataset_flatten`, in conjunction with the mappings defined in the newly created global attributes ``_flattener_variable_map`` and - ``_flattener_dimension_map``, to find which netCDF variables are - acting as CF coordinate variables in the flattened dataset. See - https://cfconventions.org/cf-conventions/cf-conventions.html#groups - for details. + ``_flattener_dimension_map``, to find which variables are acting + as CF coordinate variables in the flattened dataset. See CF + conventions section 2.7 Groups for details. For example, if an input dataset has dimension ``lat`` in the root group and coordinate variable ``lat(lat)`` in group ``/group1``, @@ -81,43 +87,94 @@ def netcdf_flatten( :Parameters: input_ds: - The dataset to be flattened, that has the same API as - `netCDF4.Dataset` or `h5netcdf.File`. + The dataset to be flattened. Must be an open dataet object + with the same API as `netCDF4.Dataset`, `h5netcdf.File`, + or `zarr.Group`. output_ds: `netCDF4.Dataset` - A container for the flattened dataset. + A container for the flattened dataset that will get + updated in-place with the flattened input dataset. strict: `bool`, optional If True, the default, then failing to resolve a reference raises an exception. If False, a warning is issued and flattening is continued. - omit_data: `bool`, optional - If True then do not copy the data of any variables from - *input_ds* to *output_ds*. This does not affect the amount - of netCDF variables and dimensions that are written to the - file, nor the netCDF variables' attributes, but for all - variables it does not create data on disk or in - memory. 
The resulting dataset will be smaller than it - otherwise would have been, and when the new dataset is - accessed the data of these variables will be represented - by an array of all missing data. If False, the default, - then all data arrays are copied. - - write_chunksize: `int`, optional - When *omit_data* is False, the copying of data is done - piecewise to keep memory usage down. *write_chunksize* is - the size in bytes of how much data is copied from - *input_ds* to *output_ds* for each piece. Ignored if - *omit_data* is True. + copy_data: `bool`, optional + By default, *copy_data* is True and all data arrays from + *input_ds* are copied to *output_ds*. If False then no + data arrays are copied, instead all variables' data will + be represented by the fill value, but without having to + actually create these arrays in memory or on disk. + + group_dimension_search: `str`, optional + How to interpret a dimension name that contains no + group-separator characters, such as ``dim`` (as opposed to + ``group/dim``, ``/group/dim``, ``../dim``, etc.). The + *group_dimension_search* parameter must be one of: + + * ``'closest_ancestor'`` + + This is the default and is the behaviour defined by the + CF conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + dimension with the same name and size in an ancestor + group, if one exists. If multiple such dimensions exist, + then the correspondence is with the dimension in the + ancestor group that is **closest** to the sub-group + (i.e. that is furthest away from the root group). + + * ``'furthest_ancestor'`` + + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + one with the same name and size in an ancestor group, if + one exists. If multiple such dimensions exist, then the + correspondence is with the dimension in the ancestor + group that is **furthest away** from the sub-group + (i.e. that is closest to the root group). + + * ``'local'`` + + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + + Assume that the sub-group dimension is different to any + with the same name and size in all ancestor groups. + + .. note:: For a netCDF dataset, for which it is always + well-defined in which group a dimension is + defined, *group_dimension_search* may only take + the default value of ``'closest_ancestor'``, + which applies the behaviour defined by the CF + conventions (section 2.7 Groups). + + For a Zarr dataset, for which there is no means + of indicating whether or not the same dimension + names that appear in different groups correspond + to each other, setting this parameter may be + necessary for the correct interpretation of the + dataset in the event that its dimensions are + named in a manner that is inconsistent with CF + rules defined by the CF conventions (section 2.7 + Groups). + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `None` """ _Flattener( input_ds, output_ds, strict, - omit_data=omit_data, - write_chunksize=write_chunksize, + copy_data=copy_data, + group_dimension_search=group_dimension_search, ).flatten() @@ -231,11 +288,13 @@ def generate_var_attr_str(d): class _Flattener: - """Information and methods needed to flatten a netCDF dataset. + """Information and methods needed to flatten a dataset. Contains the input file, the output file being flattened, and all the logic of the flattening process. + See `dataset_flatten` for detais. + .. 
versionadded:: (cfdm) 1.11.2.0 """ @@ -245,46 +304,126 @@ def __init__( input_ds, output_ds, strict=True, - omit_data=False, - write_chunksize=134217728, + copy_data=True, + group_dimension_search="closest_ancestor", ): """**Initialisation** :Parameters: input_ds: - The dataset to be flattened, that has the same API as - `netCDF4.Dataset` or `h5netcdf.File`. + The dataset to be flattened. Must be an object with + the same API as `netCDF4.Dataset` or + `h5netcdf.File`, or else a `zarr.Group` object. output_ds: `netCDF4.Dataset` A container for the flattened dataset. strict: `bool`, optional - See `netcdf_flatten`. + See `dataset_flatten`. - omit_data: `bool`, optional - See `netcdf_flatten`. + copy_data: `bool`, optional + See `dataset_flatten`. - write_chunksize: `int`, optional - See `netcdf_flatten`. + group_dimension_search: `str`, optional + See `dataset_flatten`. + + .. versionadded:: (cfdm) NEXTVERSION """ + # Mapping of flattened attribute names to their full-path + # counterparts. + # + # E.g. ['Conventions: /Conventions'] self._attr_map_value = [] + + # Mapping of flattened dimension names to their full-path + # counterparts + # + # E.g. ['bounds2: /bounds2', + # 'x: /x', + # 'forecast__y: /forecast/y'] self._dim_map_value = [] + + # Mapping of flattened variable names to their full-path + # counterparts + # + # E.g. ['x_bnds: /x_bnds', + # 'x: /x', + # 'b_bounds: /b_bounds', + # 'b: /b', + # 'latitude_longitude: /latitude_longitude', + # 'forecast__y: /forecast/y'] self._var_map_value = [] + # Mapping of full-path dimension names to their flattened + # counterparts + # + # E.g. {'/bounds2': 'bounds2', + # '/x': 'x', + # '/forecast/y': 'forecast__y'} self._dim_map = {} + + # Mapping of full-path variable names to their flattened + # counterparts + # + # E.g. {'/x_bnds': 'x_bnds', + # '/x': 'x', + # '/b_bounds': 'b_bounds', + # '/b': 'b', + # '/latitude_longitude': 'latitude_longitude', + # '/forecast/y': 'forecast__y'} self._var_map = {} + # Mapping of full-path group names to the dimensions defined + # therein + # + # E.g. {'/': {'feature': , + # 'station': }, + # '/forecast': {'element': }, + # '/forecast/model': {}} + # + # Currently this mapping is only required for an input + # `zarr.Group` dataset, and is generated by + # `_populate_dimension_maps`. + self._group_to_dims = {} + + # Mapping of variable names to their Dimension objects. + # + # E.g. {'x': (,), + # 'x_bnds': (, + # ), + # 'latitude_longitude': (), + # 'forecast/y': (>> f.filepath(dataset) + >>> f.dataset_name() '/home/data/file.nc' """ - try: - # netCDF4 - return dataset.filepath() - except AttributeError: - # h5netcdf - return dataset.filename + if dataset is None: + dataset = self._input_ds + + match self._backend(): + case "h5netcdf": + return dataset.filename + + case "netCDF4": + return dataset.filepath() + + case "zarr": + return str(dataset.store) - def get_dims(self, variable): - """Return the dimensions associated with a variable. + def _variable_dimensions(self, variable): + """Return the dimension objects associated with a variable. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + variable: + The variable object. 
+ :Returns: - `list` + `list` of dimension objects """ - try: - # netCDF4 - return variable.get_dims() - except AttributeError: - # h5netcdf - dims = {} - dimension_names = list(variable.dimensions) - group = variable._parent - for name, dim in group.dims.items(): - if name in dimension_names: - dims[name] = dim - dimension_names.remove(name) - - group = group.parent - while group is not None and dimension_names: + match self._backend(): + case "netCDF4": + return variable.get_dims() + + case "h5netcdf": + dims = {} + dimension_names = list(variable.dimensions) + group = variable._parent for name, dim in group.dims.items(): if name in dimension_names: dims[name] = dim dimension_names.remove(name) group = group.parent + while group is not None and dimension_names: + for name, dim in group.dims.items(): + if name in dimension_names: + dims[name] = dim + dimension_names.remove(name) - return [dims[name] for name in variable.dimensions] + group = group.parent + + return [dims[name] for name in variable.dimensions] + + case "zarr": + return self._var_to_dims[variable.name] def getncattr(self, x, attr): """Retrieve a netCDF attribute. @@ -526,31 +688,52 @@ def getncattr(self, x, attr): :Returns: """ - try: - # netCDF4 - return getattr(x, attr) - except AttributeError: - # h5netcdf - return x.attrs[attr] + match self._backend(): + case "h5netcdf" | "zarr": + return x.attrs[attr] + + case "netCDF4": + return getattr(x, attr) def group(self, x): - """Return the group that a variable belongs to. + """Return the group that a variable or dimension belongs to. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + x: + The variable or dimension object. + :Returns: `Group` """ - try: - # netCDF4 - return x.group() - except AttributeError: - # h5netcdf - return x._parent + match self._backend(): + case "netCDF4": + return x.group() + + case "h5netcdf": + return x._parent + + case "zarr": + try: + # Variable + group_name = group_separator.join( + x.path.split(group_separator)[:-1] + ) + g = self._input_ds.get(group_name) + if g is None: + # Must be the root group + g = self._input_ds + + return g + except AttributeError: + # Dimension + return x.group() - def name(self, x): + def name(self, x, dataset=None): """Return the netCDF name, without its groups. .. versionadded:: (cfdm) 1.11.2.0 @@ -560,68 +743,101 @@ def name(self, x): `str` """ - out = x.name - if group_separator in out: - # h5netcdf - out = x.name.split(group_separator)[-1] + match self._backend(dataset): + case "h5netcdf" | "netCDF4": + return x.name.split(group_separator)[-1] - return out + case "zarr": + try: + # Variable + return x.path.split(group_separator)[-1] + except AttributeError: + # Dimension + return x.name.split(group_separator)[-1] - def ncattrs(self, x): - """Return netCDF attribute names. + def _attribute_names(self, x): + """Return attribute names of a variable, group, or dataset. .. 
versionadded:: (cfdm) 1.11.2.0 :Parameters: - x: variable, group, or dataset + x: + The variable, group, or dataset object :Returns: `list` """ - try: - # netCDF4 - return x.ncattrs() - except AttributeError: - # h5netcdf - return list(x.attrs) + match self._backend(): + case "h5netcdf": + attrs = list(x.attrs) + + case "netCDF4": + attrs = x.ncattrs() + + case "zarr": + attrs = dict(x.attrs) + + # Remove _ARRAY_DIMENSIONS from Zarr v2 variable + # attributes + if x.metadata.zarr_format == 2 and hasattr(x, "shape"): + attrs.pop("_ARRAY_DIMENSIONS", None) + + attrs = list(attrs) + + return attrs def parent(self, group): - """Return a simulated unix parent group. + """Return the parent group. .. versionadded:: (cfdm) 1.11.2.0 :Returns: - `str` + `Group` or `None` + The parent group, or `None` if *group* is the root + group (and so has no parent). """ - try: - return group.parent - except AttributeError: - return + match self._backend(): + case "h5netcdf" | "netCDF4": + return group.parent + + case "zarr": + name = group.name + if name == group_separator: + return + + return self._input_ds[ + group_separator.join(name.split(group_separator)[:-1]) + ] def path(self, group): """Return a simulated unix directory path to a group. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + group: + The group object. + :Returns: `str` """ - try: - # netCDF4 - return group.path - except AttributeError: - # h5netcdf - try: - return group.name - except AttributeError: - return group_separator + match self._backend(): + case "h5netcdf" | "zarr": + try: + return group.name + except AttributeError: + return group_separator + + case "netCDF4": + return group.path def flatten(self): """Flattens and writes to output file. @@ -636,7 +852,10 @@ def flatten(self): input_ds = self._input_ds output_ds = self._output_ds - logging.info(f"Flattening the groups of {self.filepath(input_ds)}") + if self._debug: + logger.debug( + f"Flattening the groups of {self.dataset_name()}" + ) # pragma: no cover # Flatten product self.process_group(input_ds) @@ -647,10 +866,12 @@ def flatten(self): output_ds.setncattr(flattener_variable_map, self._var_map_value) # Browse flattened variables to rename references: - logging.info( - " Browsing flattened variables to rename references " - "in attributes" - ) + if self._debug: + logger.debug( + " Browsing flattened variables to rename references " + "in attributes" + ) # pragma: no cover + for var in output_ds.variables.values(): self.adapt_references(var) @@ -661,26 +882,29 @@ def process_group(self, input_group): :Parameters: - input_group: `str` - The group to flatten. + input_group: + The group object to flatten. 
:Returns: `None` """ - logging.info(f" Browsing group {self.path(input_group)}") + if self._debug: + logger.debug( + f" Browsing group {self.path(input_group)}" + ) # pragma: no cover - for attr_name in self.ncattrs(input_group): + for attr_name in self._attribute_names(input_group): self.flatten_attribute(input_group, attr_name) - for dim in input_group.dimensions.values(): + for dim in self._group_dimensions(input_group).values(): self.flatten_dimension(dim) - for var in input_group.variables.values(): + for var in self._group_variables(input_group).values(): self.flatten_variable(var) - for child_group in input_group.groups.values(): + for child_group in self._child_groups(input_group).values(): self.process_group(child_group) def flatten_attribute(self, input_group, attr_name): @@ -690,8 +914,8 @@ def flatten_attribute(self, input_group, attr_name): :Parameters: - input_group: `str` - The group containing the attribute to flatten. + input_group: + The group object containing the attribute to flatten. attr_name: `str` The name of the attribute. @@ -701,14 +925,15 @@ def flatten_attribute(self, input_group, attr_name): `None` """ - logging.info( - f" Copying attribute {attr_name} from " - f"group {self.path(input_group)} to root" - ) - # Create new name new_attr_name = self.generate_flattened_name(input_group, attr_name) + if self._debug: + logger.debug( + f" Creating global attribute {new_attr_name!r} from " + f"group {self.path(input_group)}" + ) # pragma: no cover + # Write attribute self._output_ds.setncattr( new_attr_name, self.getncattr(input_group, attr_name) @@ -727,23 +952,23 @@ def flatten_dimension(self, dim): :Parameters: dim: - The dimension to flatten, that has the same API as - `netCDF4.Dimension` or `h5netcdf.Dimension`. + The dimension object to flatten. :Returns: `None` """ - logging.info( - f" Copying dimension {self.name(dim)} from " - f"group {self.path(self.group(dim))} to root" - ) - # Create new name - new_name = self.generate_flattened_name( - self.group(dim), self.name(dim) - ) + group = self.group(dim) + name = self.name(dim) + new_name = self.generate_flattened_name(group, name) + + if self._debug: + logger.debug( + f" Creating dimension {new_name!r} from " + f"group {self.path(group)!r}" + ) # pragma: no cover # Write dimension self._output_ds.createDimension( @@ -751,15 +976,11 @@ def flatten_dimension(self, dim): ) # Store new name in dict for resolving references later - self._dim_map[self.pathname(self.group(dim), self.name(dim))] = ( - new_name - ) + self._dim_map[self.pathname(group, name)] = new_name # Add to name mapping attribute self._dim_map_value.append( - self.generate_mapping_str( - self.group(dim), self.name(dim), new_name - ) + self.generate_mapping_str(group, name, new_name) ) def flatten_variable(self, var): @@ -770,45 +991,42 @@ def flatten_variable(self, var): :Parameters: var: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. 
:Returns: `None` """ - logging.info( - f" Copying variable {self.name(var)} from " - f"group {self.path(self.group(var))} to root" - ) - # Create new name new_name = self.generate_flattened_name( self.group(var), self.name(var) ) + if self._debug: + logger.debug( + f" Creating variable {new_name!r} from " + f"{self.pathname(self.group(var), self.name(var))!r}" + ) # pragma: no cover + # Replace old by new dimension names new_dims = list( map( lambda x: self._dim_map[ self.pathname(self.group(x), self.name(x)) ], - self.get_dims(var), + self._variable_dimensions(var), ) ) # Write variable - fullname = self.pathname(self.group(var), self.name(var)) - logging.info(f" Creating variable {new_name} from {fullname}") + attributes = self._variable_attrs(var) - attributes = self.attrs(var) - - omit_data = self._omit_data - if omit_data: - fill_value = False - else: + copy_data = self._copy_data + if copy_data: fill_value = attributes.pop("_FillValue", None) + else: + fill_value = False new_var = self._output_ds.createVariable( new_name, @@ -825,8 +1043,8 @@ def flatten_variable(self, var): fill_value=fill_value, ) - if not omit_data: - self.write_data_in_chunks(var, new_var) + if copy_data: + self.write_data(var, new_var) # Copy attributes new_var.setncatts(attributes) @@ -903,55 +1121,81 @@ def increment_pos(self, pos, dim, copy_slice_shape, var_shape): # increment. Finish. return False - def write_data_in_chunks(self, old_var, new_var): - """Copy the data of a variable to a new one by slice. + def write_data(self, old_var, new_var): + """Copy the data of a variable to the output dataset. .. versionadded:: (cfdm) 1.11.2.0 :Parameters: old_var: - The variable where the data should be copied from, - that has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The variable object where the data should be copied + from. new_var: - The new variable in which to copy the data, that has the - same API as `netCDF4.Variable` or `h5netcdf.Variable`. + The new variable object in which to copy the data. :Returns: `None` """ - ndim = old_var.ndim - shape = old_var.shape - chunk_shape = ( - (self.write_chunksize // (old_var.dtype.itemsize * ndim)), - ) * ndim - - logging.info( - f" Copying {self.name(old_var)!r} data in chunks of " - f"{chunk_shape}" - ) - # Initial position vector - pos = [0] * ndim - - # Copy in slices until end reached - var_end_reached = False - while not var_end_reached: - # Create current slice - current_slice = tuple( - slice(pos[dim_i], min(shape[dim_i], pos[dim_i] + dim_l)) - for dim_i, dim_l in enumerate(chunk_shape) - ) - - # Copy data in slice - new_var[current_slice] = old_var[current_slice] + import dask.array as da + import numpy as np + + from cfdm.data.locks import netcdf_lock + + # Need to convert a string-valued 'old_var' to a numpy array + if self.dtype(old_var) == str: + match self._backend(): + case "h5netcdf" | "netCDF4": + array = old_var[...] + + string_type = isinstance(array, str) + if string_type: + # A netCDF string type scalar variable comes + # out as Python str object, so convert it to a + # numpy array. + array = np.array(array, dtype=f"U{len(array)}") + + if not old_var.ndim: + # NetCDF4 has a thing for making scalar size 1 + # variables into 1d arrays + array = array.squeeze() + + if not string_type: + # An N-d (N>=1) netCDF string type variable + # comes out as a numpy object array, so + # convert it to numpy string array. 
+ array = array.astype("U", copy=False) + # netCDF4 doesn't auto-mask VLEN variables + # array = np.ma.where(array == "", + # np.ma.masked, array) + array = np.ma.masked_values(array, "") + + old_var = array + + case "zarr": + array = old_var[...] + array = array.astype("O", copy=False).astype( + "U", copy=False + ) + fill_value = old_var.attrs.get( + "_FillValue", old_var.attrs.get("missing_value", "") + ) + array = np.where(array == "", fill_value, array) + old_var = array - # Get next position - var_end_reached = not self.increment_pos( - pos, 0, chunk_shape, shape + if isinstance(old_var, np.ndarray): + new_var[...] = old_var + else: + dx = da.from_array(old_var) + da.store( + dx, + new_var, + compute=True, + return_stored=False, + lock=netcdf_lock, ) def resolve_reference(self, orig_ref, orig_var, rules): @@ -968,9 +1212,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): The reference to resolve. orig_var: - The original variable containing the reference, that - has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object containing the reference. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1066,9 +1308,7 @@ def resolve_reference_proximity( and vice versa. orig_var: - The original variable containing the reference, that - has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object containing the reference. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1120,8 +1360,9 @@ def resolve_reference_proximity( ), ref_type, ) - else: - return None, "" + + # Unresolved + return None, "" def resolve_reference_post_processing( self, absolute_ref, orig_ref, orig_var, rules, ref_type, method @@ -1139,9 +1380,7 @@ def resolve_reference_post_processing( The original reference. orig_var: - The original variable containing the reference, that - has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object containing the reference. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1162,23 +1401,26 @@ def resolve_reference_post_processing( """ # If not found and accept standard name, assume standard name if absolute_ref is None and rules.accept_standard_names: - logging.info( - f" Reference to {orig_ref!r} not " - "resolved. Assumed to be a standard name." - ) + if self._debug: + logger.debug( + f" Reference to {orig_ref!r} not " + "resolved. Assumed to be a standard name." + ) # pragma: no cover + ref_type = "standard_name" absolute_ref = orig_ref elif absolute_ref is None: # Not found, so raise exception. absolute_ref = self.handle_reference_error( - orig_ref, self.path(self.group(orig_var)) + rules.name, orig_ref, self.path(self.group(orig_var)) ) else: # Found - logging.info( - f" {method} reference to {ref_type} " - f"{orig_ref!r} resolved as {absolute_ref!r}" - ) + if self._debug: + logger.debug( + f" {method} reference to {ref_type} " + f"{orig_ref!r} resolved as {absolute_ref!r}" + ) # pragma: no cover # If variables refs are limited to coordinate variable, # additional check @@ -1187,16 +1429,18 @@ def resolve_reference_post_processing( and rules.limit_to_scalar_coordinates and ( ( - "coordinates" not in self.ncattrs(orig_var) + "coordinates" not in self._attribute_names(orig_var) or orig_ref not in self.getncattr(orig_var, "coordinates") ) or self._input_ds[absolute_ref].ndim > 0 ) ): - logging.info( - f" Reference to {orig_ref!r} is not a " - "scalar coordinate variable. 
Assumed to be a standard name." - ) + if self._debug: + logger.debug( + f" Reference to {orig_ref!r} is not a scalar " + "coordinate variable. Assumed to be a standard name." + ) # pragma: no cover + absolute_ref = orig_ref # Return result @@ -1215,8 +1459,8 @@ def search_by_relative_path(self, ref, current_group, search_dim): ref: `str` The reference to resolve. - current_group: `str` - The current group of the reference. + current_group: + The current group object of the reference. search_dim: `bool` If True then search for a dimension, otherwise a @@ -1229,26 +1473,28 @@ def search_by_relative_path(self, ref, current_group, search_dim): """ # Go up parent groups - while ref.startswith("../"): - if current_group.parent is None: - return None + while ref.startswith(f"..{group_separator}"): + parent = self.parent(current_group) + if parent is None: + return ref = ref[3:] - current_group = current_group.parent + current_group = parent # Go down child groups ref_split = ref.split(group_separator) for g in ref_split[:-1]: try: - current_group = current_group.groups[g] + current_group = self._child_groups(current_group)[g] except KeyError: - return None + return # Get variable or dimension if search_dim: - elt = current_group.dimensions[ref_split[-1]] + elt = tuple(self._group_dimensions(current_group))[ref_split[-1]] + else: - elt = current_group.variables[ref_split[-1]] + elt = tuple(self._group_variables(current_group))[ref_split[-1]] # Get absolute reference return self.pathname(self.group(elt), self.name(elt)) @@ -1278,7 +1524,7 @@ def search_by_proximity( The reference to resolve. current_group: - The current group where searching. + The current group object where searching. search_dim: `bool` If True then search for a dimension, otherwise a @@ -1298,31 +1544,32 @@ def search_by_proximity( """ if search_dim: - dims_or_vars = current_group.dimensions + dims_or_vars = self._group_dimensions(current_group) else: - dims_or_vars = current_group.variables + dims_or_vars = self._group_variables(current_group) # Found in current group - if ref in dims_or_vars.keys(): + if ref in dims_or_vars: return dims_or_vars[ref] local_apex_reached = ( - local_apex_reached or ref in current_group.dimensions.keys() + local_apex_reached or ref in self._group_dimensions(current_group) ) # Check if have to continue looking in parent group # - normal search: continue until root is reached # - coordinate variable: continue until local apex is reached + parent_group = self.parent(current_group) if is_coordinate_variable: - top_reached = local_apex_reached or current_group.parent is None + top_reached = local_apex_reached or parent_group is None else: - top_reached = current_group.parent is None + top_reached = parent_group is None # Search up if not top_reached: return self.search_by_proximity( ref, - current_group.parent, + parent_group, search_dim, local_apex_reached, is_coordinate_variable, @@ -1330,9 +1577,9 @@ def search_by_proximity( elif is_coordinate_variable and local_apex_reached: # Coordinate variable and local apex reached, so search - # down in siblings + # down in siblings. found_elt = None - for child_group in current_group.groups.values(): + for child_group in self._child_groups(current_group).values(): found_elt = self.search_by_proximity( ref, child_group, @@ -1345,9 +1592,8 @@ def search_by_proximity( return found_elt - else: - # Did not find - return None + # Did not find + return def resolve_references(self, var, old_var): """Resolve references. 
@@ -1360,21 +1606,18 @@ def resolve_references(self, var, old_var): :Parameters: var: - The flattened variable in which references should be - renamed with absolute references, that has the same - API as `netCDF4.Variable` or `h5netcdf.Variable`. + The flattened variable object in which references + should be renamed with absolute references. old_var: - The original variable (in group structure), that has - the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object (in group structure). :Returns: `None` """ - var_attrs = self.attrs(var) + var_attrs = self._variable_attrs(var, "output") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value parsed_attribute = parse_attribute(name, var_attrs[name]) @@ -1413,16 +1656,15 @@ def adapt_references(self, var): :Parameters: var: - The flattened variable in which references should be - renamed with new names, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The flattened variable object in which references + should be renamed with new names. :Returns: `None` """ - var_attrs = self.attrs(var) + var_attrs = self._variable_attrs(var, "output") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value value = var_attrs[name] @@ -1446,10 +1688,12 @@ def adapt_references(self, var): new_attr_value = generate_var_attr_str(adapted_parsed_attr) var.setncattr(name, new_attr_value) - logging.info( - f" Value of {self.name(var)}.{name} changed " - f"from {value!r} to {new_attr_value!r}" - ) + if self._debug: + logger.debug( + " Value of attribute " + f"{self.name(var, 'output')}.{name} " + f"changed from {value!r} to {new_attr_value!r}" + ) # pragma: no cover def adapt_name(self, resolved_ref, rules): """Apapt the name. @@ -1459,6 +1703,8 @@ def adapt_name(self, resolved_ref, rules): .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + resolved_ref: `str` The resolved reference. @@ -1508,9 +1754,9 @@ def adapt_name(self, resolved_ref, rules): else: # If not found, raise exception - return self.handle_reference_error(resolved_ref) + return self.handle_reference_error(rules.name, resolved_ref) - def pathname(self, group, name): + def pathname(self, group, name=None): """Compose full path name to an element in a group structure. .. versionadded:: (cfdm) 1.11.2.0 @@ -1518,7 +1764,7 @@ def pathname(self, group, name): :Parameters: current_group: - The group containing the dimension or variable. + The group object containing the dimension or variable. name: `str` The name of the dimension or variable. @@ -1545,8 +1791,8 @@ def generate_mapping_str(self, input_group, name, new_name): :Parameters: input_group: - The group containing the non-flattened dimension or - variable. + The group object containing the non-flattened + dimension or variable. name: `str` The name of the non-flattened dimension or variable. @@ -1604,7 +1850,7 @@ def generate_flattened_name(self, input_group, orig_name): :Parameters: input_group: - The group containing the dimension or variable. + The group object containing the dimension or variable. orig_name: `str` The original name of the dimension or variable. @@ -1646,7 +1892,7 @@ def generate_flattened_name(self, input_group, orig_name): return new_name - def handle_reference_error(self, ref, context=None): + def handle_reference_error(self, role, ref, context=None): """Handle reference error. 
Depending on the `_strict` mode, either raise an exception or @@ -1657,8 +1903,12 @@ def handle_reference_error(self, ref, context=None): :Parameters: + role: `str` + The CF role of the reference, + e.g. ``'instance_dimension'``, ``'cell_measures'``. + ref: `str` - The reference + The reference. context: `str` Additional context information to add to message. @@ -1670,7 +1920,7 @@ def handle_reference_error(self, ref, context=None): `UnresolvedReferenceException` is raised. """ - message = f"Reference {ref!r} could not be resolved" + message = f"{role} reference {ref!r} could not be resolved" if context is not None: message = f"{message} from {context}" @@ -1680,6 +1930,451 @@ def handle_reference_error(self, ref, context=None): warnings.warn(message) return f"{ref_not_found_error}_{ref}" + def _group_dimensions(self, group): + """Return dimensions that are defined in a group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group object. + + :Returns: + + `dict`-like + The dimensions defined in the group, keyed by their + names. + + """ + match self._backend(): + case "h5netcdf" | "netCDF4": + if self._group_dimension_search != "closest_ancestor": + raise ValueError( + f"For netCDF dataset {self.dataset_name()}, " + "the group_dimension_search keyword must be " + "'closest_ancestor'. " + f"Got {self._group_dimension_search!r}" + ) + + return group.dimensions + + case "zarr": + group_name = self.path(group) + if not self._group_to_dims and group_name == group_separator: + # Populate the `_group_to_dims` and `_var_to_dims` + # dictionaries if we're at the root group + self._populate_dimension_maps(group) + + return self._group_to_dims[group_name] + + def _group_variables(self, group): + """Return variables that are defined in a group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group object. + + :Returns: + + `dict`-like + The variables, keyed by their names. + + """ + match self._backend(): + case "h5netcdf" | "netCDF4": + return group.variables + + case "zarr": + return dict(group.arrays()) + + def _populate_dimension_maps(self, group): + """Populate the dimension map dictionaries. + + For the given group and all of its child groups, a mapping of + full-path group names to the unique dimensions implied by the + variables therein will be added to `_group_to_dims`. For + instance:: + + {'/': {}, + 'bounds2': , + 'x': }, + '/forecast': {'y': }, + '/forecast/model': {}} + + + For the given group and all of its child groups, a mapping of + full-path variables names to their dimensions will be added to + `_var_to_dims`. For instance:: + + {'/latitude_longitude': (), + '/x': (,), + '/x_bnds': ( + ), + '/forecast/cell_measure': (, + ), + '/forecast/latitude': (, + ), + '/forecast/longitude': (, + ), + '/forecast/rotated_latitude_longitude': (), + '/forecast/time': (), + '/forecast/y': (,), + '/forecast/y_bnds': (, + ), + '/forecast/model/ta': (, + )} + + **Zarr datasets** + + Populating the `_group_to_dims` dictionary is currently only + required for a Zarr grouped dataset, for which this + information is not explicitly defined in the format's data + model (unlike for netCDF and HDF5 datasets). + + See `dataset_flatten` for details. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group object. 
+ + :Returns: + + `None` + + """ + from ..zarr import ZarrDimension + + group_name = self.path(group) + + input_ds = self._input_ds + group_to_dims = self._group_to_dims + var_to_dims = self._var_to_dims + group_dimension_search = self._group_dimension_search + + # Initialise the mapping from this group to its ZarrDimension + # objects. Use 'setdefault' because a previous call to + # `_populate_dimension_maps` might already have done this. + group_to_dims.setdefault(group_name, {}) + + # Loop over variables in this group, sorted by variable name. + for v in dict(sorted(group.arrays())).values(): + # Initialise mapping from the variable to its + # ZarrDimension objects + var_name = v.name + var_to_dims[var_name] = () + + dimension_names = self._variable_dimension_names(v) + if not dimension_names: + # A scalar variable has no dimensions + continue + + # Loop over this variable's dimension names + for name, size in zip(dimension_names, v.shape): + name_split = name.split(group_separator) + basename = name_split[-1] + + # ---------------------------------------------------- + # Define 'g' as the absolute path name of the group in + # which to register the logical dimension object for + # this dimension. + # + # Which group is defined will depend on the nature of + # the dimension's 'name'. + # ---------------------------------------------------- + if group_separator not in name: + # ------------------------------------------------ + # Relative path dimension name which contains no + # '/' characters. The behaviour depends on the + # search algorithm defined by + # 'group_dimension_search'. + # + # E.g. "dim" + # ------------------------------------------------ + if group_dimension_search in ( + "closest_ancestor", + "furthest_ancestor", + ): + # Find the names of all ancestor groups, in + # the appropriate order for searching. + group_split = group_name.split(group_separator) + ancestor_names = [ + group_separator.join(group_split[:n]) + for n in range(1, len(group_split)) + ] + ancestor_names[0] = group_separator + # E.g. if the current group is /g1/g2/g3 then + # the ancestor group names are [/, /g1, + # /g1/g2] + + if group_dimension_search == "closest_ancestor": + # "closest_ancestor" searching requires + # the ancestor group order to be reversed, + # e.g. [/g1/g2, /g1, /] + ancestor_names = ancestor_names[::-1] + + # Search through the ancestors in order, + # stopping if we find a matching dimension. + found_dim_in_ancestor = False + for g in ancestor_names: + zarr_dim = group_to_dims[g].get(basename) + if zarr_dim is not None and zarr_dim.size == size: + # Found a dimension in this ancestor + # group 'g' with the right name and + # size + found_dim_in_ancestor = True + break + + if not found_dim_in_ancestor: + # Dimension 'basename' could not be + # matched to any ancestor group + # dimensions, so define it in the current + # group. + g = group_name + + elif group_dimension_search == "local": + # Assume that the dimension is different to + # any with same name and size defined in any + # ancestor group. + g = group_name + + else: + raise DimensionParsingException( + "Bad 'group_dimension_search' value: " + f"{group_dimension_search!r}" + ) + else: + g = group_separator.join(name_split[:-1]) + if name.endswith(group_separator): + # -------------------------------------------- + # Dimension name that ends with '/' + # + # E.g. "dim/" + # E.g. 
"group1/dim/" + # -------------------------------------------- + raise DimensionParsingException( + "Dimension names can't end with the group " + f"separator ({group_separator}): " + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" + ) + + elif f"{group_separator}..{group_separator}" in name: + # -------------------------------------------- + # Relative path dimension name with upward + # path traversals ('../') *not* at the start + # of the name. + # + # E.g. "/group1/../group2/dim" + # E.g. "group1/../group2/dim" + # E.g. "../group1/../group2/dim" + # + # Note that "../../dim" is not such a case. + # -------------------------------------------- + raise DimensionParsingException( + "In Zarr datasets, can't yet deal with a " + "relative path dimension name with upward path " + "traversals (../) in middle of the name: " + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" + "\n\n" + "Please raise an issue at " + "https://github.com/NCAS-CMS/cfdm/issues " + "if you would like this feature." + ) + + elif name.startswith(f"..{group_separator}"): + # -------------------------------------------- + # Relative path dimension name with upward + # path traversals ('../') at the start of the + # name + # + # E.g. "../group1/dim" + # E.g. "../../group1/dim" + # E.g. "../../dim" + # -------------------------------------------- + current_group = group + while g.startswith(f"..{group_separator}"): + parent_group = self.parent(current_group) + current_group = parent_group + g = g[3:] + if parent_group is None and g.startswith( + f"..{group_separator}" + ): + # We're about to go beyond the root + # group! + raise DimensionParsingException( + "Upward path traversals in Zarr dimension " + "name go beyond the root group: " + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" + ) + + g = group_separator.join((self.path(current_group), g)) + + elif name.startswith(group_separator): + # -------------------------------------------- + # Absolute path dimension name that starts + # with '/', and contains no upward path + # traversals ('../'). + # + # E.g. "/dim" + # E.g. "/group1/dim" + # -------------------------------------------- + if g == "": + g = group_separator + + else: + # -------------------------------------------- + # Relative path dimension name which contains + # '/' and which contains no upward path + # traversals ('../'). + # + # E.g. "group1/dim" + # -------------------------------------------- + g = group_separator.join((group_name, g)) + + zarr_dim = None + if g in group_to_dims: + # Group 'g' is already registered in the mapping + zarr_dim = group_to_dims[g].get(basename) + if zarr_dim is not None: + # Dimension 'basename' is already registered + # in group 'g' + if zarr_dim.size != size: + raise DimensionParsingException( + f"Zarr Dimension has the wrong size: {size}. " + f"Expected size {zarr_dim.size} " + "(defined by variable " + f"{zarr_dim.reference_variable().name}). 
" + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" + ) + else: + # Initialise group 'g' in the mapping + group_to_dims[g] = {} + + if zarr_dim is None: + # Register a new ZarrDimension in a group + defining_group = input_ds.get(g) + if defining_group is None: + # Must be the root group + defining_group = input_ds + + zarr_dim = ZarrDimension(basename, size, defining_group, v) + group_to_dims[g][basename] = zarr_dim + + # Map the variable to the ZarrDimension object + var_to_dims[var_name] += (zarr_dim,) + + # Recursively scan all child groups + for g in group.group_values(): + self._populate_dimension_maps(g) + + def _variable_dimension_names(self, var): + """Return the dimension names for a variable. + + Currently this is only required for, and only works for, Zarr + variables. An `AttributeError` will be raised if called for + any other type of variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + var: + The variable object. + + :Returns: + + `list` of `str` + The variable's dimension names. A scalar variable will + have an empty list. + + """ + zarr_format = var.metadata.zarr_format + match zarr_format: + case 3: + dimensions = var.metadata.dimension_names + case 2: + dimensions = var.metadata.attrs.get("_ARRAY_DIMENSIONS") + case _: + raise DimensionParsingException( + f"Can't flatten a Zarr v{zarr_format} dataset. " + "Only Zarr v3 and v2 can be flattened" + ) + + if dimensions is None: + if var.shape: + raise DimensionParsingException( + f"Non-scalar Zarr v{zarr_format} variable has no " + f"dimension names: {var.name}" + ) + + dimensions = [] + + return dimensions + + def _child_groups(self, group): + """Return groups that are defined in this group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group object. + + :Returns: + + `dict`-like + The groups, keyed by their names. + + """ + match self._backend(): + case "h5netcdf" | "netCDF4": + return group.groups + + case "zarr": + return dict(group.groups()) + + def _backend(self, dataset=None): + """Return the name of the backend that defines a dataset. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `str` or `None` + If set to ``'output'`` then the name of the output + dataset backend will be returned. If `None` (the + default) then the name of backend that defines the + input dataset is returned. + + :Returns: + + `str` + The backend name. + + """ + if dataset is None: + return self._input_ds_backend + + if dataset == "output": + return "netCDF4" + + raise ("Bad value of 'dataset'") + class AttributeParsingException(Exception): """Exception for unparsable attribute. @@ -1691,6 +2386,16 @@ class AttributeParsingException(Exception): pass +class DimensionParsingException(Exception): + """Exception for unparsable dimension. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + pass + + class UnresolvedReferenceException(Exception): """Exception for unresolvable references in attributes. 
diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 6942847f2..602205690 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -26,7 +26,7 @@ NETCDF_MAGIC_NUMBERS, NETCDF_QUANTIZATION_PARAMETERS, ) -from .flatten import netcdf_flatten +from .flatten import dataset_flatten from .flatten.config import ( flattener_attribute_map, flattener_dimension_map, @@ -138,7 +138,7 @@ class NetCDFRead(IORead): "is not referenced in file": 9, "exists in the file": 10, "does not exist in file": 11, - "exists in multiple external files": 12, + "exists in multiple external datasets": 12, "has incorrect size": 13, "is missing": 14, "is not used by data variable": 15, @@ -445,11 +445,11 @@ def _reference(self, ncvar, referencing_ncvar): return count - def file_close(self): - """Close all netCDF files that have been opened. + def dataset_close(self): + """Close all netCDF datasets that have been opened. - Includes the input file being read, any external files, and any - temporary flattened files. + Includes the input dataset being read, any external datasets, + and any temporary flattened dataset. :Returns: @@ -457,33 +457,34 @@ def file_close(self): **Examples** - >>> r.file_close() + >>> r.dataset_close() """ g = self.read_vars - # Close temporary flattened files - for flat_file in g["flat_files"]: - flat_file.close() + # Close temporary flattened datasets + for flat_dataset in g["flat_datasets"]: + flat_dataset.close() - if g["file_opened_with"] == "zarr": - # zarr - return - - # netCDF4, h5netcdf for nc in g["datasets"]: - nc.close() + try: + nc.close() + except AttributeError: + pass # Close the original grouped file (v1.8.8.1) if "nc_grouped" in g: - g["nc_grouped"].close() + try: + g["nc_grouped"].close() + except AttributeError: + pass # Close s3fs.File objects for f in g["s3fs_File_objects"]: f.close() - def file_open(self, dataset, flatten=True, verbose=None): - """Open the netCDF file for reading. + def dataset_open(self, dataset, flatten=True, verbose=None): + """Open the netCDF dataset for reading. If the file has hierarchical groups then a flattened version of it is returned, and the original grouped file remains open. 
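As a minimal sketch of what that flattening step looks like in isolation (the import path and the filename ``grouped.nc`` are assumptions; the keyword values mirror the ``dataset_flatten`` call made further down in this method)::

    # Flatten a grouped netCDF dataset into a diskless, non-persistent
    # netCDF4 dataset, without copying any data. The import path and
    # the input filename are assumptions for the purpose of the sketch.
    import tempfile

    import netCDF4

    from cfdm.read_write.netcdf.flatten import dataset_flatten

    nc = netCDF4.Dataset("grouped.nc", "r")

    # Diskless, non-persistent container for the flattened view
    flat_file = tempfile.NamedTemporaryFile(
        mode="wb", prefix="cfdm_flat_", suffix=".nc"
    )
    flat_nc = netCDF4.Dataset(
        flat_file.name, "w", diskless=True, persist=False
    )
    flat_nc.set_fill_off()

    # Flatten without copying data, resolving path-less group dimension
    # names against the closest ancestor group
    dataset_flatten(
        nc,
        flat_nc,
        strict=False,
        copy_data=False,
        group_dimension_search="closest_ancestor",
    )

The flattened ``flat_nc`` can then be read as if the original dataset had no groups, while ``nc`` itself stays open so that information not preserved by flattening (such as which dimensions are unlimited) can still be retrieved from the original group structure.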
@@ -507,7 +508,7 @@ def file_open(self, dataset, flatten=True, verbose=None): **Examples** - >>> r.file_open('file.nc') + >>> r.dataset_open('file.nc') """ from uritools import urisplit @@ -560,7 +561,7 @@ def file_open(self, dataset, flatten=True, verbose=None): ) # pragma: no cover # Map backend names to file-open functions - file_open_function = { + dataset_open_function = { "h5netcdf": self._open_h5netcdf, "netCDF4": self._open_netCDF4, "zarr": self._open_zarr, @@ -572,7 +573,7 @@ def file_open(self, dataset, flatten=True, verbose=None): errors = [] for backend in netcdf_backend: try: - nc = file_open_function[backend](dataset) + nc = dataset_open_function[backend](dataset) except KeyError: errors.append(f"{backend}: Unknown netCDF backend name") except Exception as error: @@ -601,7 +602,7 @@ def file_open(self, dataset, flatten=True, verbose=None): # Create a diskless, non-persistent container for the # flattened file - flat_file = tempfile.NamedTemporaryFile( + flat_dataset = tempfile.NamedTemporaryFile( mode="wb", dir=tempfile.gettempdir(), prefix="cfdm_flat_", @@ -610,12 +611,18 @@ def file_open(self, dataset, flatten=True, verbose=None): ) flat_nc = netCDF4.Dataset( - flat_file, "w", diskless=True, persist=False + flat_dataset, "w", diskless=True, persist=False ) flat_nc.set_fill_off() # Flatten the file - netcdf_flatten(nc, flat_nc, strict=False, omit_data=True) + dataset_flatten( + nc, + flat_nc, + strict=False, + copy_data=False, + group_dimension_search=g["group_dimension_search"], + ) # Store the original grouped file. This is primarily # because the unlimited dimensions in the flattened @@ -626,10 +633,10 @@ def file_open(self, dataset, flatten=True, verbose=None): nc = flat_nc g["has_groups"] = True - g["flat_files"].append(flat_file) + g["flat_datasets"].append(flat_dataset) g["nc_opened_with"] = "netCDF4" else: - g["nc_opened_with"] = g["file_opened_with"] + g["nc_opened_with"] = g["original_dataset_opened_with"] g["nc"] = nc return nc @@ -652,7 +659,7 @@ def _open_netCDF4(self, filename): import netCDF4 nc = netCDF4.Dataset(filename, "r") - self.read_vars["file_opened_with"] = "netCDF4" + self.read_vars["original_dataset_opened_with"] = "netCDF4" return nc def _open_h5netcdf(self, filename): @@ -686,7 +693,7 @@ def _open_h5netcdf(self, filename): rdcc_w0=0.75, rdcc_nslots=4133, ) - self.read_vars["file_opened_with"] = "h5netcdf" + self.read_vars["original_dataset_opened_with"] = "h5netcdf" return nc def _open_zarr(self, dataset): @@ -707,11 +714,14 @@ def _open_zarr(self, dataset): try: import zarr except ModuleNotFoundError as error: - error.msg += ". Install the 'zarr' package to read Zarr datasets" + error.msg += ( + ". Install the 'zarr' package " + "(https://pypi.org/project/zarr) to read Zarr datasets" + ) raise - nc = zarr.open(dataset) - self.read_vars["file_opened_with"] = "zarr" + nc = zarr.open(dataset, mode="r") + self.read_vars["original_dataset_opened_with"] = "zarr" return nc def cdl_to_netcdf(self, filename): @@ -942,6 +952,7 @@ def read( cache=True, dask_chunks="storage-aligned", store_dataset_chunks=True, + store_dataset_shards=True, cfa=None, cfa_write=None, to_memory=None, @@ -950,8 +961,9 @@ def read( dataset_type=None, cdl_string=False, ignore_unknown_type=False, + group_dimension_search="closest_ancestor", ): - """Reads a netCDF dataset from file or OPenDAP URL. + """Reads a netCDF or Zarr dataset from file or OPenDAP URL. Read fields from a netCDF file on disk or from an OPeNDAP server location. 
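Assuming the two new read-time options are passed straight through from `cfdm.read`, as the "See `cfdm.read` for details" notes indicate, a call that exercises them might look like the following (the dataset name is hypothetical, and both values shown are the documented defaults)::

    # Hypothetical example: read a grouped Zarr dataset, keeping the
    # Zarr sharding strategy on the returned data and resolving
    # path-less group dimension names against the closest ancestor
    # group.
    import cfdm

    fields = cfdm.read(
        "forecast.zarr",
        store_dataset_shards=True,
        group_dimension_search="closest_ancestor",
    )

Setting ``store_dataset_shards=False`` would skip recording the sharding strategy on the returned data, in the same way that ``store_dataset_chunks=False`` skips the dataset chunking strategy.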
@@ -1018,7 +1030,7 @@ def read( dask_chunks: `str`, `int`, `None`, or `dict`, optional Specify the `dask` chunking of dimensions for data in - the input files. See `cfdm.read` for details. + the input datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.11.2.0 @@ -1028,20 +1040,26 @@ def read( .. versionadded:: (cfdm) 1.12.0.0 + store_dataset_shards: `bool`, optional + Store the dataset sharding strategy. See `cfdm.read` + for details. + + .. versionadded:: (cfdm) NEXTVERSION + cfa: `dict`, optional - Configure the reading of CF-netCDF aggregation files. - See `cfdm.read` for details. + Configure the reading of CF-netCDF aggregation + datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 cfa_write: sequence of `str`, optional - Configure the reading of CF-netCDF aggregation files. - See `cfdm.read` for details. + Configure the reading of CF-netCDF aggregation + datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 to_memory: (sequence) of `str`, optional - Whether or not to bring data arrays into memory. See + Whether or not to bring data arrays into memory. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 @@ -1060,8 +1078,8 @@ def read( .. versionadded:: (cfdm) 1.12.0.0 dataset_type: `None` or (sequence of) `str`, optional - Only read files of the given type(s). See `cfdm.read` - for details. + Only read datasets of the given type(s). See + `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 @@ -1077,6 +1095,12 @@ def read( .. versionadded:: (cfdm) 1.11.2.0 + group_dimension_search: `str`, optional + How to interpret a group dimension name that has no + path. See `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `list` @@ -1135,7 +1159,7 @@ def read( # valid. # # Note that the `dataset_type` method is much faster than the - # `file_open` method at returning for unrecognised types. + # `dataset_open` method at returning for unrecognised types. 
# ------------------------------------------------------------ d_type = self.dataset_type(dataset, dataset_type) if not d_type: @@ -1386,8 +1410,9 @@ def read( # Assume a priori that the dataset does not have a group # structure "has_groups": False, - # Keep a list of flattened file names - "flat_files": [], + "group_dimension_search": group_dimension_search, + # Keep a list of flattened dataset names + "flat_datasets": [], # -------------------------------------------------------- # Domains (CF>=1.9) # -------------------------------------------------------- @@ -1446,9 +1471,11 @@ def read( # Dask chunking of aggregated data for selected constructs "cfa_write": cfa_write, # -------------------------------------------------------- - # Whether or not to store the dataset chunking strategy + # Whether or not to store the dataset chunking and + # sharding strategies # -------------------------------------------------------- "store_dataset_chunks": bool(store_dataset_chunks), + "store_dataset_shards": bool(store_dataset_shards), # -------------------------------------------------------- # Constructs to read into memory # -------------------------------------------------------- @@ -1484,10 +1511,10 @@ def read( g.update(deepcopy(extra_read_vars)) # ------------------------------------------------------------ - # Open the netCDF file to be read + # Open the netCDF dataset to be read # ------------------------------------------------------------ try: - nc = self.file_open(dataset, flatten=True, verbose=None) + nc = self.dataset_open(dataset, flatten=True, verbose=None) except DatasetTypeError: if not g["ignore_unknown_type"]: raise @@ -1586,7 +1613,7 @@ def read( dimension_isunlimited = {} # ------------------------------------------------------------ - # For grouped files (CF>=1.8) map: + # For grouped datasets (CF>=1.8) map: # # * each flattened variable name to its absolute path # * each flattened dimension name to its absolute path @@ -1812,10 +1839,10 @@ def read( g["variables"] = variables # The netCDF4 dataset objects that have been opened (i.e. the - # for parent file and any external files) + # for parent file and any external datasets) g["datasets"] = [nc] - # The names of the variable in the parent files + # The names of the variable in the parent datasetss # (i.e. excluding any external variables) g["internal_variables"] = set(variables) @@ -2161,7 +2188,7 @@ def read( if g["CF>=1.7"]: logger.info( f" External variables: {g['external_variables']}\n" - f" External files : {g['external_files']}" + f" External datasets : {g['external_files']}" ) # pragma: no cover if g["external_files"] and g["external_variables"]: @@ -2347,9 +2374,9 @@ def read( self._check_valid(f, c) # ------------------------------------------------------------ - # Close all opened netCDF files + # Close all opened netCDF datasets # ------------------------------------------------------------ - self.file_close() + self.dataset_close() # ------------------------------------------------------------ # Squeeze/unsqueeze size 1 axes in field constructs @@ -2552,7 +2579,7 @@ def _customise_read_vars(self): pass def _get_variables_from_external_files(self, netcdf_external_variables): - """Get external variables from external files. + """Get external variables from external datasets. 
..versionadded:: (cfdm) 1.7.0 @@ -2598,7 +2625,7 @@ def _get_variables_from_external_files(self, netcdf_external_variables): for external_file in external_files: logger.info( - "\nScanning external file:\n-----------------------" + "\nScanning external datasets:\n---------------------------" ) # pragma: no cover # Note: We pass in the s3 file system (if any) of the @@ -2638,7 +2665,7 @@ def _get_variables_from_external_files(self, netcdf_external_variables): ncvar, message=( "External variable", - "exists in multiple external files", + "exists in multiple external datasets", ), attribute=attribute, ) @@ -6652,7 +6679,8 @@ def _create_netcdfarray( group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) + variable = self._file_group_variables(group).get(name) + else: variable = g["variables"].get(ncvar) @@ -6671,7 +6699,7 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape - size = self._file_variable_size(variable) + size = prod(shape) if size < 2: size = int(size) @@ -6718,13 +6746,17 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - file_opened_with = g["file_opened_with"] - if file_opened_with == "netCDF4": - array = self.implementation.initialise_NetCDF4Array(**kwargs) - elif file_opened_with == "h5netcdf": - array = self.implementation.initialise_H5netcdfArray(**kwargs) - elif file_opened_with == "zarr": - array = self.implementation.initialise_ZarrArray(**kwargs) + match g["original_dataset_opened_with"]: + case "netCDF4": + array = self.implementation.initialise_NetCDF4Array( + **kwargs + ) + case "h5netcdf": + array = self.implementation.initialise_H5netcdfArray( + **kwargs + ) + case "zarr": + array = self.implementation.initialise_ZarrArray(**kwargs) return array, kwargs @@ -7092,7 +7124,7 @@ def _create_data( # # b) Cached values are never really required for # compression index data. - self._cache_data_elements(data, ncvar) + self._cache_data_elements(data, ncvar, attributes) # ------------------------------------------------------------ # Set data aggregation parameters @@ -8181,36 +8213,49 @@ def _create_Data( `Data` """ - if array.dtype is None: - g = self.read_vars - if g["has_groups"]: - group, name = self._netCDF4_group( - g["variable_grouped_dataset"][ncvar], ncvar - ) - variable = group.variables.get(name) - else: - variable = g["variables"].get(ncvar) - - array = variable[...] - - string_type = isinstance(array, str) - if string_type: - # A netCDF string type scalar variable comes out as Python - # str object, so convert it to a numpy array. - array = np.array(array, dtype=f"U{len(array)}") - - if not variable.ndim: - # NetCDF4 has a thing for making scalar size 1 - # variables into 1d arrays - array = array.squeeze() + g = self.read_vars - if not string_type: - # An N-d (N>=1) netCDF string type variable comes out - # as a numpy object array, so convert it to numpy - # string array. - array = array.astype("U", copy=False) - # NetCDF4 doesn't auto-mask VLEN variables - array = np.ma.where(array == "", np.ma.masked, array) + # Deal with strings + match g["original_dataset_opened_with"]: + case "h5netcdf" | "netCDF4": + if array.dtype is None: + if g["has_groups"]: + group, name = self._netCDF4_group( + g["variable_grouped_dataset"][ncvar], ncvar + ) + variable = group.variables.get(name) + else: + variable = g["variables"].get(ncvar) + + array = variable[...] 
+ + string_type = isinstance(array, str) + if string_type: + # A netCDF string type scalar variable comes + # out as Python str object, so convert it to a + # numpy array. + array = np.array(array, dtype=f"U{len(array)}") + + if not variable.ndim: + # NetCDF4 has a thing for making scalar size 1 + # variables into 1d arrays + array = array.squeeze() + + if not string_type: + # An N-d (N>=1) netCDF string type variable + # comes out as a numpy object array, so + # convert it to numpy string array. + array = array.astype("U", copy=False) + # netCDF4 doesn't auto-mask VLEN variables + # array = np.ma.where(array == "", np.ma.masked, array) + array = np.ma.masked_values(array, "") + + case "zarr": + if array.dtype == np.dtypes.StringDType(): + array = array.astype("O", copy=False).astype( + "U", copy=False + ) + array = np.ma.masked_values(array, "") # Set the dask chunking strategy chunks = self._dask_chunks( @@ -8218,7 +8263,7 @@ def _create_Data( ) # Set whether or not to read the data into memory - to_memory = self.read_vars["to_memory"] + to_memory = g["to_memory"] to_memory = "all" in to_memory or construct_type in to_memory data = self.implementation.initialise_Data( @@ -8231,15 +8276,26 @@ def _create_Data( **kwargs, ) - # Store the dataset chunking - if self.read_vars["store_dataset_chunks"] and ncvar is not None: - # Only store the dataset chunking if 'data' has the same - # shape as its netCDF variable. This may not be the case - # for variables compressed by convention (e.g. some DSG - # variables). - chunks, shape = self._get_dataset_chunks(ncvar) - if shape == data.shape: - self.implementation.nc_set_dataset_chunksizes(data, chunks) + if ncvar is not None: + # Store the dataset chunking + if g["store_dataset_chunks"]: + # Only store the dataset chunking if 'data' has the + # same shape as its netCDF variable. This may not be + # the case for variables compressed by convention + # (e.g. some DSG variables). + chunks, shape = self._get_dataset_chunks(ncvar) + if shape == data.shape: + self.implementation.nc_set_dataset_chunksizes(data, chunks) + + # Store the dataset sharding + if g["store_dataset_shards"]: + # Only store the dataset sharding if 'data' has the + # same shape as its netCDF variable. This may not be + # the case for variables compressed by convention + # (e.g. some DSG variables). + shards, shape = self._get_dataset_shards(ncvar) + if shards is not None and shape == data.shape: + self.implementation.nc_set_dataset_shards(data, shards) return data @@ -9563,22 +9619,19 @@ def _netCDF4_group(self, nc, name): :Parameters: - nc: `netCDF4._netCDF4.Dataset` or `netCDF4._netCDF4.Group` + nc: `netCDF4.Dataset` or `h5netcdf.Group` or `zarr.Group` name: `str` :Returns: - `netCDF4._netCDF4.Dataset` or `netCDF4._netCDF4.Group`, `str` + 2-`tuple`: + The group object, and the relative-path variable name. **Examples** - >>> group, name = n._netCDF4_group(nc, 'time') - >>> group.name, name - ('/', 'time') - >>> group, name = n._netCDF4_group(nc, '/surfacelayer/Z') - >>> group.name, name - ('surfacelayer', 'Z') + >>> n._netCDF4_group(nc, '/forecast/count') + (, 'count') """ group = nc @@ -10863,9 +10916,6 @@ def _ugrid_check_connectivity_variable( def _dataset_has_groups(self, nc): """True if the dataset has a groups other than the root group. - If the dataset is a Zarr dataset then an exception is raised - of the dataset has groups. - .. 
versionadded:: (cfdm) 1.12.2.0 :Parameters: @@ -10878,18 +10928,12 @@ def _dataset_has_groups(self, nc): `bool` """ - if self.read_vars["file_opened_with"] == "zarr": - # zarr - if len(tuple(nc.groups())) > 1: - raise ReadError( - "Can't read Zarr dataset that has groups: " - f"{self.read_vars['dataset']}" - ) - - return False + match self.read_vars["original_dataset_opened_with"]: + case "h5netcdf" | "netCDF4": + return bool(nc.groups) - # netCDF4, h5netcdf - return bool(nc.groups) + case "zarr": + return bool(tuple(nc.group_keys())) def _file_global_attribute(self, nc, attr): """Return a global attribute from a dataset. @@ -10909,12 +10953,12 @@ def _file_global_attribute(self, nc, attr): The global attribute value. """ - try: - # netCDF4 - return nc.getncattr(attr) - except AttributeError: - # h5netcdf, zarr - return nc.attrs[attr] + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "zarr": + return nc.attrs[attr] + + case "netCDF4": + return nc.getncattr(attr) def _file_global_attributes(self, nc): """Return the global attributes from a dataset. @@ -10924,7 +10968,8 @@ def _file_global_attributes(self, nc): :Parameters: nc: `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. :Returns: @@ -10933,42 +10978,71 @@ def _file_global_attributes(self, nc): names. """ - try: - # h5netcdf, zarr - return nc.attrs - except AttributeError: - # netCDF4 - return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "zarr": + return nc.attrs + + case "netCDF4": + return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + + def _file_group_variables(self, group): + """Return all variables in a group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group. + + :Returns: + + `dict`-like + A dictionary of the variables keyed by their names. + + """ + match self.read_vars["original_dataset_opened_with"]: + case "h5netcdf" | "netCDF4": + return group.variables + + case "zarr": + return dict(group.arrays()) def _file_dimensions(self, nc): """Return all dimensions in the root group. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + nc: `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. + :Returns: `dict`-like A dictionary of the dimensions keyed by their names. """ - try: - # netCDF4, h5netcdf - return nc.dimensions - except AttributeError: - # zarr - dimensions = {} - for var in self._file_variables(nc).values(): - dimensions.update( - { - name: ZarrDimension(name, size, nc) - for name, size in zip( - self._file_variable_dimensions(var), var.shape - ) - if name not in dimensions - } - ) + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + dimensions = dict(nc.dimensions) - return dimensions + case "zarr": + dimensions = {} + for var in self._file_variables(nc).values(): + dimensions.update( + { + name: ZarrDimension(name, size, nc) + for name, size in zip( + self._file_variable_dimensions(var), var.shape + ) + if name not in dimensions + } + ) + + return dimensions def _file_dimension(self, nc, dim_name): """Return a dimension from the root group of a dataset. @@ -10978,14 +11052,15 @@ def _file_dimension(self, nc, dim_name): :Parameters: nc: `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. 
dim_name: `str` The dimension name. :Returns: - `netCDF.Dimension` or `h5netcdf.Dimension` + `netCDF.Dimension` or `h5netcdf.Dimension` or `ZarrDimension` The dimension. """ @@ -10993,14 +11068,15 @@ def _file_dimension(self, nc, dim_name): return self._file_dimensions(nc)[dim_name] def _file_dimension_isunlimited(self, nc, dim_name): - """Return whether a dimension is unlimited. + """Return whether a dimension in the root group is unlimited. .. versionadded:: (cfdm) 1.11.2.0 :Parameters: nc: `netCDF4.Dataset` or `h5netcdf.File` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. dim_name: `str` The dimension name. @@ -11011,12 +11087,12 @@ def _file_dimension_isunlimited(self, nc, dim_name): Whether the dimension is unlimited. """ - try: - # netCDF4, h5netcdf - return self._file_dimension(nc, dim_name).isunlimited() - except Exception: - # zarr - return False + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + return self._file_dimension(nc, dim_name).isunlimited() + + case "zarr": + return False def _file_dimension_size(self, nc, dim_name): """Return a dimension's size. @@ -11048,7 +11124,8 @@ def _file_variables(self, nc): :Parameters: nc: `netCDF4.Dataset`, `h5netcdf.File` or `zarr.Group` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. :Returns: @@ -11056,12 +11133,12 @@ def _file_variables(self, nc): A dictionary of the variables keyed by their names. """ - try: - # netCDF4, h5netcdf - return nc.variables - except AttributeError: - # zarr - return dict(nc.arrays()) + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + return nc.variables + + case "zarr": + return dict(nc.arrays()) def _file_variable(self, nc, var_name): """Return a variable. @@ -11093,7 +11170,8 @@ def _file_variable_attributes(self, var): :Parameters: var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - The variable. + The variable. If the original dataset has groups, then + *var* is from the flattened dataset. :Returns: @@ -11102,28 +11180,32 @@ def _file_variable_attributes(self, var): names. """ - try: - # h5netcdf, zarr - attrs = dict(var.attrs) - except AttributeError: - # netCDF4 - return {attr: var.getncattr(attr) for attr in var.ncattrs()} - else: - if self.read_vars["file_opened_with"] == "zarr": - # zarr: Remove the _ARRAY_DIMENSIONS attribute - attrs.pop("_ARRAY_DIMENSIONS", None) + match self.read_vars["nc_opened_with"]: + case "h5netcdf": + return dict(var.attrs) + + case "netCDF4": + return {attr: var.getncattr(attr) for attr in var.ncattrs()} - return attrs + case "zarr": + attrs = dict(var.attrs) + + # Remove _ARRAY_DIMENSIONS from Zarr v2 attributes + if var.metadata.zarr_format == 2: + attrs.pop("_ARRAY_DIMENSIONS", None) + + return attrs def _file_variable_dimensions(self, var): """Return the variable dimension names. - .. versionadded:: (cfdm) 1.12.2.0 + .. versionadded:: (cfdm) 1.12.2.0 :Parameters: - var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - The variable. + var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` + The variable. If the original dataset has groups, then + *var* is from the flattened dataset. :Returns: @@ -11131,47 +11213,24 @@ def _file_variable_dimensions(self, var): The dimension names. 
""" - try: - # netCDF4, h5netcdf - return var.dimensions - except AttributeError: - try: - # zarr v3 - dimension_names = var.metadata.dimension_names - if dimension_names is None: - # scalar variable - dimension_names = () - - return dimension_names - except AttributeError: - # zarr v2 - return tuple(var.attrs["_ARRAY_DIMENSIONS"]) + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + return var.dimensions - def _file_variable_size(self, var): - """Return the size of a variable's array. + case "zarr": + match var.metadata.zarr_format: + case 3: + # Zarr v3 + dimension_names = var.metadata.dimension_names + if dimension_names is None: + # Scalar variable + dimension_names = () - .. versionadded:: (cfdm) 1.11.2.0 + return dimension_names - :Parameters: - - var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - The variable. - - :Returns: - - `int` - The array size. - - """ - # Use try/except here because the variable type could differ - # from that implied by the value of - # read_vars["file_opened_with"] - try: - # netCDF4, zarr - return var.size - except AttributeError: - # h5netcdf - return prod(var.shape) + case 2: + # Zarr v2 + return tuple(var.attrs["_ARRAY_DIMENSIONS"]) def _get_storage_options(self, dataset, parsed_dataset): """Get the storage options for accessing a file. @@ -11310,8 +11369,6 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): else: dask_chunks = g.get("dask_chunks", "storage-aligned") - storage_chunks = self._netcdf_chunksizes(g["variables"][ncvar]) - # ------------------------------------------------------------ # None # ------------------------------------------------------------ @@ -11319,6 +11376,8 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): # No Dask chunking return -1 + storage_chunks = self._variable_chunksizes(g["variables"][ncvar]) + ndim = array.ndim if ( storage_chunks is not None @@ -11606,7 +11665,7 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): # ------------------------------------------------------------ return dask_chunks - def _cache_data_elements(self, data, ncvar): + def _cache_data_elements(self, data, ncvar, attributes): """Cache selected element values. Updates *data* in-place to store its first, second, @@ -11634,6 +11693,11 @@ def _cache_data_elements(self, data, ncvar): The name of the netCDF variable that contains the data. + attributes: `dict` + The attributes of the netCDF variable. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `None` @@ -11651,7 +11715,7 @@ def _cache_data_elements(self, data, ncvar): group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) + variable = self._file_group_variables(group).get(name) else: variable = g["variables"].get(ncvar) @@ -11663,30 +11727,36 @@ def _cache_data_elements(self, data, ncvar): return # ------------------------------------------------------------ - # Still here? then there were no cached data elements, so we - # have to create them. + # Still here? Then there were no cached data elements, so we + # have to create them. # ------------------------------------------------------------ + + # Include optimisations for the common case that the entire + # array is stored in one dataset chunk (which does *not* + # include netCDF contiguous arrays), that prevent the reading + # of that chunk multiple times. 
+ one_chunk = self._variable_chunksizes(variable) == variable.shape + # Get the required element values size = data.size ndim = data.ndim - # Whether or not this is an array of strings - dtype = variable.dtype - string = dtype == str - obj = not string and dtype.kind == "O" - - # Whether or not this is an array of chars - if ( - not (string or obj) - and dtype.kind in "SU" - and variable.ndim == ndim + 1 - ): - # This variable is a netCDF classic style char array with - # a trailing dimension that needs to be collapsed - char = True - else: - char = False + # Get the values using `netcdf_indexer`, as this conveniently + # deals with different type of indexing, string and character + # arrays, etc. + variable = netcdf_indexer( + variable, + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=False, + attributes=attributes, + copy=False, + ) + # Get the cached values, minimising the number of "gets" on + # the dataset by not accessing the same chunk twice, where + # possible. if ndim == 1: # Also cache the second element for 1-d data, on the # assumption that they may well be dimension coordinate @@ -11694,14 +11764,21 @@ def _cache_data_elements(self, data, ncvar): if size == 1: indices = (0, -1) value = variable[...] - values = (value, value) + values = [value, value] elif size == 2: indices = (0, 1, -1) - value = variable[-1:] - values = (variable[:1], value, value) + values = variable[...].tolist() + values += [values[-1]] + elif size == 3: + indices = (0, 1, -1) + values = variable[...].tolist() else: indices = (0, 1, -1) - values = (variable[:1], variable[1:2], variable[-1:]) + if one_chunk: + values = variable[list(indices)].tolist() + else: + values = variable[:2].tolist() + [variable[-1:]] + elif ndim == 2 and data.shape[-1] == 2: # Assume that 2-d data with a last dimension of size 2 # contains coordinate bounds, for which it is useful to @@ -11709,59 +11786,46 @@ def _cache_data_elements(self, data, ncvar): # last cells. indices = (0, 1, -2, -1) ndim1 = ndim - 1 - values = ( - variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], - ) + if one_chunk: + v = variable[...] + else: + v = variable + index = (slice(0, 1),) * ndim1 + (slice(0, 2),) + values = v[index].squeeze().tolist() if data.size == 2: values = values + values else: - values += ( - variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], - ) + index = (slice(-1, None, 1),) * ndim1 + (slice(0, 2),) + values += v[index].squeeze().tolist() + + del v elif size == 1: indices = (0, -1) value = variable[...] - values = (value, value) + values = [value, value] elif size == 3: indices = (0, 1, -1) - if char: - values = variable[...].reshape(3, variable.shape[-1]) - else: - values = variable[...].flatten() + values = variable[...].flatten().tolist() else: indices = (0, -1) - values = ( - variable[(slice(0, 1),) * ndim], - variable[(slice(-1, None, 1),) * ndim], - ) + if one_chunk: + v = variable[...] 
+ values = [v.item(0), v.item(-1)] + del v + else: + values = [ + variable[(slice(0, 1),) * ndim], + variable[(slice(-1, None, 1),) * ndim], + ] # Create a dictionary of the element values - elements = {} - for index, value in zip(indices, values): - if obj: - value = value.astype(str) - elif string: - # Convert an array of objects to an array of strings - value = np.array(value, dtype="U") - elif char: - # Variable is a netCDF classic style char array, so - # collapse (by concatenation) the outermost (fastest - # varying) dimension. E.g. [['a','b','c']] becomes - # ['abc'] - import netCDF4 - - if dtype.kind == "U": - value = value.astype("S") - - a = netCDF4.chartostring(value) - shape = a.shape - a = np.array([x.rstrip() for x in a.flat]) - a = np.reshape(a, shape) - value = np.ma.masked_where(a == "", a) - - elements[index] = value + # + # Note: some backends might give `None` for uninitialised + # data, when we want `np.ma.masked` in this case. + elements = { + index: (value if value is not None else np.ma.masked) + for index, value in zip(indices, values) + } # Cache the cached data elements for this variable g["cached_data_elements"][ncvar] = elements @@ -11769,8 +11833,8 @@ def _cache_data_elements(self, data, ncvar): # Store the elements in the data object data._set_cached_elements(elements) - def _netcdf_chunksizes(self, variable): - """Return the variable chunk sizes. + def _variable_chunksizes(self, variable): + """Return the dataset variable chunk sizes. .. versionadded:: (cfdm) 1.11.2.0 @@ -11802,8 +11866,10 @@ def _netcdf_chunksizes(self, variable): if chunks == "contiguous": chunks = None except AttributeError: - # h5netcdf + # h5netcdf, zarr chunks = variable.chunks + if not chunks: + chunks = None return chunks @@ -11979,3 +12045,53 @@ def _set_quantization(self, parent, ncvar): # Set the Quantization metadata self.implementation.set_quantization(parent, q, copy=False) + + def _get_dataset_shards(self, ncvar): + """Return a netCDF variable's dataset storage shards. + + The sharding strategy is defined as the number of dataset + chunks (*not* the number of data array elements) along each + data array axis. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + ncvar: `str` + The netCDF variable name. + + :Returns: + + 2-tuple: + The variable's sharding strategy and its shape. If the + dataset is not Zarr, then (`None`, `None`) is + returned. + + + **Examples** + + >>> n._get_dataset_shards('tas') + [1, 2, 3], (12, 324, 432) + >>> n._get_dataset_chunks('pr') + None, (12, 324, 432) + + """ + g = self.read_vars + if g["original_dataset_opened_with"] != "zarr": + # Only Zarr datasets have shards + return None, None + + if g["has_groups"]: + nc = g["nc_grouped"] + else: + nc = g["nc"] + + var = nc[ncvar] + shards = var.shards + if shards is not None: + # 'shards' is currently the number of data array elements + # along each data array axis => re-cast it as the number + # of chunks along each of axis. 
+ shards = [s // c for s, c in zip(shards, var.chunks)] + + return shards, var.shape diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 7ffee840e..ec3b803d1 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -1,6 +1,8 @@ import copy import logging import os +from math import prod +from numbers import Integral import numpy as np @@ -16,6 +18,7 @@ NETCDF4_FMTS, NETCDF_QUANTIZATION_PARAMETERS, NETCDF_QUANTIZE_MODES, + ZARR_FMTS, ) from .netcdfread import NetCDFRead @@ -23,7 +26,7 @@ class AggregationError(Exception): - """An error relating to CF-netCDF aggregation. + """An error relating to CF aggregation. .. versionadded:: (cfdm) 1.12.0.0 @@ -33,7 +36,11 @@ class AggregationError(Exception): class NetCDFWrite(IOWrite): - """A container for writing Fields to a netCDF dataset.""" + """A container for writing Fields to a netCDF dataset. + + NetCDF3, netCDF4 and Zarr output formats are supported. + + """ def __new__(cls, *args, **kwargs): """Store the NetCDFRead class.""" @@ -66,28 +73,40 @@ def cf_cell_method_qualifiers(self): """Cell method qualifiers.""" return set(("within", "where", "over", "interval", "comment")) - def _create_netcdf_group(self, nc, group_name): - """Creates a new netCDF4 group object. + def _createGroup(self, parent, group_name): + """Creates a new dataset group object. .. versionadded:: (cfdm) 1.8.6.0 :Parameters: - nc: `netCDF4._netCDF4.Group` or `netCDF4.Dataset` + parent: `netCDF4.Dataset` or `netCDF4.Group` or `Zarr.Group` + The group in which to create the new group. group_name: `str` The name of the group. :Returns: - `netCDF4._netCDF4.Group` + `netCDF4.Group` or `Zarr.Group` The new group object. """ - return nc.createGroup(group_name) + g = self.write_vars + match g["backend"]: + case "netCDF4": + return parent.createGroup(group_name) - def _create_netcdf_variable_name(self, parent, default): - """Create an appropriate name for a netCDF variable. + case "zarr": + if group_name in parent: + return parent[group_name] + + return parent.create_group( + group_name, overwrite=g["overwrite"] + ) + + def _create_variable_name(self, parent, default): + """Create an appropriate name for a dataset variable. .. versionadded:: (cfdm) 1.7.0 @@ -100,7 +119,7 @@ def _create_netcdf_variable_name(self, parent, default): :Returns: `str` - The netCDF variable name. + The dataset variable name. """ ncvar = self.implementation.nc_get_variable(parent, None) @@ -113,14 +132,14 @@ def _create_netcdf_variable_name(self, parent, default): except AttributeError: ncvar = default elif not self.write_vars["group"]: - # A flat file has been requested, so strip off any group - # structure from the name. + # A flat dataset has been requested, so strip off any + # group structure from the name. ncvar = self._remove_group_structure(ncvar) - return self._netcdf_name(ncvar) + return self._name(ncvar) - def _netcdf_name(self, base, dimsize=None, role=None): - """Return a new netCDF variable or dimension name. + def _name(self, base, dimsize=None, role=None): + """Return a new variable or dimension name for the dataset. .. versionadded:: (cfdm) 1.7.0 @@ -135,7 +154,7 @@ def _netcdf_name(self, base, dimsize=None, role=None): :Returns: `str` - NetCDF dimension name or netCDF variable name. + The name of the new dimension or variable. 
""" if base is None: @@ -154,7 +173,7 @@ def _netcdf_name(self, base, dimsize=None, role=None): for ncdim in g["dimensions_with_role"].get(role, ()): if g["ncdim_to_size"][ncdim] == dimsize: - # Return the name of an existing netCDF dimension + # Return the name of an existing dataset dimension # with this name, this size, and matching the # given role. return ncdim @@ -208,8 +227,8 @@ def _numpy_compressed(self, array): return array.flatten() - def _write_attributes(self, parent, ncvar, extra=None, omit=()): - """Write netCDF attributes to the netCDF file. + def _write_variable_attributes(self, parent, ncvar, extra=None, omit=()): + """Write variable attributes to the dataset. :Parameters: @@ -279,7 +298,7 @@ def _write_attributes(self, parent, ncvar, extra=None, omit=()): del netcdf_attrs["_FillValue"] if not g["dry_run"]: - g["nc"][ncvar].setncatts(netcdf_attrs) + self._set_attributes(netcdf_attrs, ncvar) if skip_set_fill_value: # Re-add as known attribute since this FV is already set @@ -289,6 +308,51 @@ def _write_attributes(self, parent, ncvar, extra=None, omit=()): return netcdf_attrs + def _set_attributes(self, attributes, ncvar=None, group=None): + """Set dataset attributes on a variable or group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + attributes: `dict` + The attributes. + + ncvar: `str`, optional + The variable on which to set the attributes. Must be + set if *group* is `None`. + + group: `str`, optional + The group on which to set the attributes. Must be set + if *ncvar* is `None`. + + :Returns: + + `None` + + """ + g = self.write_vars + if ncvar is not None: + # Set variable attributes + x = g["nc"][ncvar] + elif group is not None: + # Set group-level attributes + x = group + else: + raise ValueError("Must set ncvar or group") + + match g["backend"]: + case "netCDF4": + x.setncatts(attributes) + case "zarr": + # `zarr` can't encode numpy arrays in the zarr.json + # file + for attr, value in attributes.items(): + if isinstance(value, np.ndarray): + attributes[attr] = value.tolist() + + x.update_attributes(attributes) + def _character_array(self, array): """Converts a numpy array of strings to character data type. @@ -356,9 +420,9 @@ def _datatype(self, variable): For example, if variable.dtype is 'float32', then 'f4' will be returned. - For a NETCDF4 format file, numpy string data types will either - return `str` regardless of the numpy string length (and a - netCDF4 string type variable will be created) or, if + For a NETCDF4 format dataset, numpy string data types will + either return `str` regardless of the numpy string length (and + a netCDF4 string type variable will be created) or, if `self.write_vars['string']`` is `False`, ``'S1'`` (see below). For all other output netCDF formats (such NETCDF4_CLASSIC, @@ -369,8 +433,8 @@ def _datatype(self, variable): dimension) is expected to be done elsewhere (currently in the _write_netcdf_variable method). - If the input variable has no `!dtype` attribute (or it is None) - then 'S1' is returned, or `str` for NETCDF files. + If the input variable has no `!dtype` attribute (or it is + None) then 'S1' is returned, or `str` for NETCDF datasets. :Parameters: @@ -380,7 +444,7 @@ def _datatype(self, variable): :Returns: `str` or str - The `netCDF4.createVariable` data type corresponding to the + The `_createVariable` data type corresponding to the datatype of the array of the input variable. 
""" @@ -389,13 +453,20 @@ def _datatype(self, variable): if not isinstance(variable, np.ndarray): data = self.implementation.get_data(variable, None) if data is None: + if g["fmt"] == "ZARR3": + return str + return "S1" else: data = variable dtype = getattr(data, "dtype", None) if dtype is None or dtype.kind in "SU": - if g["fmt"] == "NETCDF4" and g["string"]: + fmt = g["fmt"] + if fmt == "NETCDF4" and g["string"]: + return str + + if fmt == "ZARR3": return str return "S1" @@ -407,7 +478,9 @@ def _datatype(self, variable): return f"{dtype.kind}{dtype.itemsize}" def _string_length_dimension(self, size): - """Creates a netCDF dimension for string variables if necessary. + """Return a dataset dimension for string variables. + + The dataset dimension will be created, if required. :Parameters: @@ -416,7 +489,7 @@ def _string_length_dimension(self, size): :Returns: `str` - The netCDF dimension name. + The dataset dimension name. """ g = self.write_vars @@ -424,28 +497,54 @@ def _string_length_dimension(self, size): # ------------------------------------------------------------ # Create a new dimension for the maximum string length # ------------------------------------------------------------ - ncdim = self._netcdf_name( - f"strlen{size}", dimsize=size, role="string_length" - ) + ncdim = self._name(f"strlen{size}", dimsize=size, role="string_length") if ncdim not in g["ncdim_to_size"]: # This string length dimension needs creating g["ncdim_to_size"][ncdim] = size # Define (and create if necessary) the group in which to - # place this netCDF dimension. + # place this dataset dimension. parent_group = self._parent_group(ncdim) if not g["dry_run"]: try: - parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) except RuntimeError: pass # TODO convert to 'raise' via fixes upstream return ncdim - def _netcdf_dimensions(self, field, key, construct): - """Returns the netCDF dimension names for the construct axes. + def _createDimension(self, group, ncdim, size): + """Create a dataset dimension in group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: `netCDF.Dataset` or `netCDF.Group` or `zarr.Group` + The group in which to create the dimension. + + ncdim: `str` + The name of the dimension in the group. + + size: `int` + The size of the dimension. + + :Returns: + + `None` + + """ + match self.write_vars["backend"]: + case "netCDF4": + group.createDimension(ncdim, size) + case "zarr": + # Dimensions are not created in Zarr datasets + pass + + def _dataset_dimensions(self, field, key, construct): + """Returns the dataset dimension names for the construct. The names are returned in a tuple. If the metadata construct has no data, then `None` is returned. @@ -463,7 +562,7 @@ def _netcdf_dimensions(self, field, key, construct): :Returns: `tuple` or `None` - The netCDF dimension names, or `None` if there are no + The dataset dimension names, or `None` if there are no data. """ @@ -498,8 +597,8 @@ def _netcdf_dimensions(self, field, key, construct): # ---------------------------------------------------- if sample_ncdim is None: # The list variable has not yet been written to - # the file, so write it and also get the netCDF - # name of the sample dimension. + # the dataset, so write it and also get the + # dataset name of the sample dimension. 
list_variable = self.implementation.get_list(construct) sample_ncdim = self._write_list_variable( field, @@ -514,10 +613,10 @@ def _netcdf_dimensions(self, field, key, construct): # Compression by contiguous ragged array # # No need to do anything because i) the count variable - # has already been written to the file, ii) we already - # have the position of the sample dimension in the - # compressed array, and iii) we already have the - # netCDF name of the sample dimension. + # has already been written to the dataset, ii) we + # already have the position of the sample dimension in + # the compressed array, and iii) we already have the + # dataset name of the sample dimension. # ---------------------------------------------------- pass @@ -526,10 +625,10 @@ def _netcdf_dimensions(self, field, key, construct): # Compression by indexed ragged array # # No need to do anything because i) the index variable - # has already been written to the file, ii) we already - # have the position of the sample dimension in the - # compressed array, and iii) we already have the - # netCDF name of the sample dimension. + # has already been written to the dataset, ii) we + # already have the position of the sample dimension in + # the compressed array, and iii) we already have the + # dataset name of the sample dimension. # ---------------------------------------------------- pass elif compression_type == "ragged indexed contiguous": @@ -550,12 +649,12 @@ def _netcdf_dimensions(self, field, key, construct): def _write_dimension( self, ncdim, f, axis=None, unlimited=False, size=None ): - """Write a netCDF dimension to the file. + """Write a dimension to the dataset. :Parameters: ncdim: `str` - The netCDF dimension name. + The dataset dimension name. f: `Field` or `Domain` @@ -579,7 +678,7 @@ def _write_dimension( if axis is not None: domain_axis = self.implementation.get_domain_axes(f)[axis] logger.info( - f" Writing {domain_axis!r} to netCDF dimension: {ncdim}" + f" Writing {domain_axis!r} to dimension: {ncdim}" ) # pragma: no cover size = self.implementation.get_domain_axis_size(f, axis) @@ -588,15 +687,16 @@ def _write_dimension( g["ncdim_to_size"][ncdim] = size # Define (and create if necessary) the group in which to place - # this netCDF dimension. + # this dataset dimension. parent_group = self._parent_group(ncdim) - if g["group"] and "/" in ncdim: + if g["group"] and "/" in ncdim and g["backend"] != "zarr": # This dimension needs to go into a sub-group so replace # its name with its basename (CF>=1.8) ncdim = self._remove_group_structure(ncdim) - if not g["dry_run"]: + # Dimensions are not created in Zarr datasets + if not g["dry_run"] and g["backend"] != "zarr": if unlimited: # Create an unlimited dimension size = None @@ -605,16 +705,16 @@ def _write_dimension( except RuntimeError as error: message = ( "Can't create unlimited dimension " - f"in {g['netcdf'].data_model} file ({error})." + f"in {g['netcdf'].data_model} dataset ({error})." ) error = str(error) if error == "NetCDF: NC_UNLIMITED size already in use": raise RuntimeError( message - + f" In a {g['netcdf'].data_model} file only one " - "unlimited dimension is allowed. Consider using " - "a netCDF4 format." + + f" In a {g['netcdf'].data_model} dataset only " + "one unlimited dimension is allowed. Consider " + "using a netCDF4 format." 
) raise RuntimeError(message) @@ -626,16 +726,16 @@ def _write_dimension( except RuntimeError as error: raise RuntimeError( f"Can't create size {size} dimension {ncdim!r} in " - f"{g['netcdf'].data_model} file ({error})" + f"{g['netcdf'].data_model} dataset ({error})" ) g["dimensions"].add(ncdim) def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): - """Writes a coordinate variable and its bounds variable to file. + """Write a coordinate and bounds variables to the dataset. - This also writes a new netCDF dimension to the file and, if - required, a new netCDF dimension for the bounds. + For netCDF datasets, this also writes a new dimension to the + dataset and, if required, a new dimension for the bounds. :Parameters: @@ -646,10 +746,11 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): coord: Dimension coordinate construct ncdim: `str` or `None` - The name of the netCDF dimension for this dimension - coordinate construct, including any groups structure. Note - that the group structure may be different to the - coordinate variable, and the basename. + The name of the dataset dimension for this dimension + coordinate construct, including any groups + structure. Note that the group structure may be + different to the coordinate variable, and the + basename. coordinates: `list` This list may get updated in-place. @@ -659,7 +760,7 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): :Returns: `str` - The netCDF name of the dimension coordinate. + The dataset name of the dimension coordinate. """ g = self.write_vars @@ -682,12 +783,12 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): # coordinate. create = True - # If the dimension coordinate is already in the file but not - # in an approriate group then we have to create a new netCDF - # variable. This is to prevent a downstream error ocurring - # when the parent data variable tries to reference one of its - # netCDF dimensions that is not in the same group nor a parent - # group. + # If the dimension coordinate is already in the dataset but + # not in an approriate group then we have to create a new + # dataset variable. This is to prevent a downstream error + # ocurring when the parent data variable tries to reference + # one of its dataset dimensions that is not in the same group + # nor a parent group. if already_in_file and not create: ncvar = coord.nc_get_variable("") groups = self._groups(seen[id(coord)]["ncvar"]) @@ -695,35 +796,33 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): create = True if create: - ncvar = self._create_netcdf_variable_name(coord, default=None) + ncvar = self._create_variable_name(coord, default=None) if ncvar is None: - # No netCDF variable name has been set, so use the - # corresponding netCDF dimension name + # No dataset variable name has been set, so use the + # corresponding dataset dimension name ncvar = ncdim if ncvar is None: - # No netCDF variable name not correponding to a netCDF - # dimension name has been set, so create a default - # netCDF variable name. - ncvar = self._create_netcdf_variable_name( - coord, default="coordinate" - ) + # No dataset variable name not correponding to a + # dataset dimension name has been set, so create a + # default dataset variable name. 
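The reworded error messages above reflect a netCDF constraint rather than anything cfdm-specific: classic-model formats permit only one unlimited dimension per dataset, which is why the writer suggests switching to a netCDF-4 format. The constraint can be demonstrated directly with `netCDF4` (hypothetical file name)::

    import netCDF4

    nc = netCDF4.Dataset("example.nc", "w", format="NETCDF3_CLASSIC")
    nc.createDimension("time", None)  # first unlimited dimension is fine
    try:
        # A second unlimited dimension is rejected by the classic model
        nc.createDimension("obs", None)
    except RuntimeError as error:
        print(error)  # e.g. "NetCDF: NC_UNLIMITED size already in use"
    finally:
        nc.close()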
+ ncvar = self._create_variable_name(coord, default="coordinate") ncdim = ncvar - # Create a new dimension + # Create a new dataset dimension unlimited = self._unlimited(f, axis) self._write_dimension(ncdim, f, axis, unlimited=unlimited) - ncdimensions = self._netcdf_dimensions(f, key, coord) + ncdimensions = self._dataset_dimensions(f, key, coord) # If this dimension coordinate has bounds then write the - # bounds to the netCDF file and add the 'bounds' or + # bounds to the dataset and add the 'bounds' or # 'climatology' attribute (as appropriate) to a dictionary # of extra attributes extra = self._write_bounds(f, coord, key, ncdimensions, ncvar) - # Create a new dimension coordinate variable + # Create a new dimension coordinate dataset variable self._write_netcdf_variable( ncvar, ncdimensions, @@ -740,8 +839,8 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): g["axis_to_ncdim"][axis] = seen[id(coord)]["ncdims"][0] if g["coordinates"] and ncvar is not None: - # Add the dimension coordinate netCDF variable name to the - # 'coordinates' attribute + # Add the dimension coordinate dataset variable name to + # the 'coordinates' attribute coordinates.append(ncvar) return ncvar @@ -749,16 +848,14 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): def _write_count_variable( self, f, count_variable, ncdim=None, create_ncdim=True ): - """Write a count variable to the netCDF file.""" + """Write a count variable to the dataset.""" g = self.write_vars if not self._already_in_file(count_variable): - ncvar = self._create_netcdf_variable_name( - count_variable, default="count" - ) + ncvar = self._create_variable_name(count_variable, default="count") if create_ncdim: - ncdim = self._netcdf_name(ncdim) + ncdim = self._name(ncdim) self._write_dimension( ncdim, f, @@ -769,10 +866,10 @@ def _write_count_variable( # -------------------------------------------------------- # Create the sample dimension # -------------------------------------------------------- - _ = self.implementation.nc_get_sample_dimension( + sample_ncdim = self.implementation.nc_get_sample_dimension( count_variable, "element" ) - sample_ncdim = self._netcdf_name(_) + sample_ncdim = self._name(sample_ncdim) self._write_dimension( sample_ncdim, f, @@ -803,7 +900,7 @@ def _write_index_variable( create_ncdim=True, instance_dimension=None, ): - """Write an index variable to the netCDF file. + """Write an index variable to the dataset. :Parameters: @@ -812,30 +909,28 @@ def _write_index_variable( index_variable: Index variable sample_dimension: `str` - The name of the netCDF sample dimension. + The name of the dataset sample dimension. ncdim: `str`, optional create_ncdim: bool, optional instance_dimension: `str`, optional - The name of the netCDF instance dimension. + The name of the dataset instance dimension. :Returns: `str` - The name of the netCDF sample dimension. + The name of the dataset sample dimension. 
""" g = self.write_vars if not self._already_in_file(index_variable): - ncvar = self._create_netcdf_variable_name( - index_variable, default="index" - ) + ncvar = self._create_variable_name(index_variable, default="index") if create_ncdim: - ncdim = self._netcdf_name(ncdim) + ncdim = self._name(ncdim) self._write_dimension( ncdim, f, @@ -856,15 +951,13 @@ def _write_index_variable( return sample_dimension def _write_list_variable(self, f, list_variable, compress): - """Write a list variable to the netCDF file.""" + """Write a list variable to the dataset.""" g = self.write_vars create = not self._already_in_file(list_variable) if create: - ncvar = self._create_netcdf_variable_name( - list_variable, default="list" - ) + ncvar = self._create_variable_name(list_variable, default="list") # Create a new dimension self._write_dimension( @@ -885,10 +978,10 @@ def _write_list_variable(self, f, list_variable, compress): return ncvar def _write_scalar_data(self, f, value, ncvar): - """Write a dimension coordinate and bounds to the netCDF file. + """Write a dimension coordinate and bounds to the dataset. - This also writes a new netCDF dimension to the file and, if - required, a new netCDF bounds dimension. + For netCDF datasets, this also writes a new dimension to the + dataset and, if required, a new bounds dimension. .. note:: This function updates ``g['seen']``. @@ -901,7 +994,7 @@ def _write_scalar_data(self, f, value, ncvar): :Returns: `str` - The netCDF name of the scalar data variable + The dataset name of the scalar data variable """ g = self.write_vars @@ -911,7 +1004,7 @@ def _write_scalar_data(self, f, value, ncvar): create = not self._already_in_file(value, ncdims=()) if create: - ncvar = self._netcdf_name(ncvar) # DCH ? + ncvar = self._name(ncvar) # DCH ? # Create a new dimension coordinate variable self._write_netcdf_variable(ncvar, (), value, None) @@ -921,7 +1014,7 @@ def _write_scalar_data(self, f, value, ncvar): return ncvar def _create_geometry_container(self, field): - """Create a geometry container variable in the netCDF file. + """Create a geometry container variable in the dataset. .. versionadded:: (cfdm) 1.8.0 @@ -932,7 +1025,7 @@ def _create_geometry_container(self, field): :Returns: `dict` - A representation off the CF-netCDF geometry container + A representation of the CF geometry container variable for field construct. If there is no geometry container then the dictionary is empty. @@ -969,7 +1062,7 @@ def _create_geometry_container(self, field): try: coord_ncvar = g["seen"][id(coord)]["ncvar"] except KeyError: - # There is no netCDF auxiliary coordinate variable + # There is no auxiliary coordinate dataset variable pass else: gc[geometry_id].setdefault("coordinates", []).append( @@ -1090,13 +1183,13 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): Specifically, returns True if a variable is logically equal any variable in the g['seen'] dictionary. - If this is the case then the variable has already been written to - the output netCDF file and so we don't need to do it again. + If this is the case then the variable has already been written + to the output dataset and so we don't need to do it again. - If 'ncdims' is set then a extra condition for equality is applied, - namely that of 'ncdims' being equal to the netCDF dimensions - (names and order) to that of a variable in the g['seen'] - dictionary. 
+ If 'ncdims' is set then a extra condition for equality is + applied, namely that of 'ncdims' being equal to the dataset + dimensions (names and order) to that of a variable in the + g['seen'] dictionary. When `True` is returned, the input variable is added to the g['seen'] dictionary. @@ -1122,7 +1215,7 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): `bool` `True` if the variable has already been written to the - file, `False` otherwise. + dataset, `False` otherwise. """ g = self.write_vars @@ -1131,9 +1224,9 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): for value in seen.values(): if ncdims is not None and ncdims != value["ncdims"]: - # The netCDF dimensions (names and order) of the input - # variable are different to those of this variable in - # the 'seen' dictionary + # The dataset dimensions (names and order) of the + # input variable are different to those of this + # variable in the 'seen' dictionary continue # Still here? @@ -1150,14 +1243,14 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): return False def _write_geometry_container(self, field, geometry_container): - """Write a netCDF geometry container variable. + """Write a geometry container variable to the dataset. .. versionadded:: (cfdm) 1.8.0 :Returns: `str` - The netCDF variable name for the geometry container. + The dataset variable name for the geometry container. """ g = self.write_vars @@ -1167,11 +1260,12 @@ def _write_geometry_container(self, field, geometry_container): # Use this existing geometry container return ncvar - # Still here? Then write the geometry container to the file + # Still here? Then write the geometry container to the + # dataset. ncvar = self.implementation.nc_get_geometry_variable( field, default="geometry_container" ) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) logger.info( f" Writing geometry container variable: {ncvar}" @@ -1181,15 +1275,13 @@ def _write_geometry_container(self, field, geometry_container): kwargs = { "varname": ncvar, "datatype": "S1", - "dimensions": (), "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) if not g["dry_run"]: self._createVariable(**kwargs) - - g["nc"][ncvar].setncatts(geometry_container) + self._set_attributes(geometry_container, ncvar) # Update the 'geometry_containers' dictionary g["geometry_containers"][ncvar] = geometry_container @@ -1199,11 +1291,10 @@ def _write_geometry_container(self, field, geometry_container): def _write_bounds( self, f, coord, coord_key, coord_ncdimensions, coord_ncvar=None ): - """Creates a bounds netCDF variable and returns its name. + """Creates a bounds dataset variable. - Specifically, creates a bounds netCDF variable, creating a new - bounds netCDF dimension if required. Returns the bounds - variable's netCDF variable name. + For netCDF datasets, also creates a new bounds dimension if + required. .. versionadded:: (cfdm) 1.7.0 @@ -1217,11 +1308,12 @@ def _write_bounds( The coordinate construct key. coord_ncdimensions: `tuple` of `str` - The ordered netCDF dimension names of the coordinate's - dimensions (which do not include the bounds dimension). + The ordered dataset dimension names of the + coordinate's dimensions (which do not include the + bounds dimension). 
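`_already_in_file` decides whether a construct can reuse a variable that has already been written by consulting the ``g['seen']`` dictionary. In outline (the ``equals`` call stands in for the implementation-level comparison, and the function name is illustrative)::

    def example_already_in_file(seen, variable, ncdims=None):
        """Return True if an equal variable has already been written."""
        for value in seen.values():
            if ncdims is not None and ncdims != value["ncdims"]:
                # Written with different dataset dimensions (names or
                # order), so it can't be reused
                continue

            if variable.equals(value["variable"]):
                # Record the match so that later references to this
                # object resolve to the existing dataset variable
                seen[id(variable)] = {
                    "variable": variable,
                    "ncvar": value["ncvar"],
                    "ncdims": value["ncdims"],
                }
                return True

        return False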
coord_ncvar: `str` - The netCDF variable name of the parent variable + The dataset variable name of the parent variable :Returns: @@ -1267,20 +1359,15 @@ def _write_bounds( size = data.shape[-1] - # bounds_ncdim = self._netcdf_name('bounds{0}'.format(size), - # dimsize=size, role='bounds') - bounds_ncdim = self.implementation.nc_get_dimension( bounds, f"bounds{size}" ) if not g["group"]: - # A flat file has been requested, so strip off any group - # structure from the name. + # A flat dataset has been requested, so strip off any + # group structure from the name. bounds_ncdim = self._remove_group_structure(bounds_ncdim) - bounds_ncdim = self._netcdf_name( - bounds_ncdim, dimsize=size, role="bounds" - ) + bounds_ncdim = self._name(bounds_ncdim, dimsize=size, role="bounds") # Check if this bounds variable has not been previously # created. @@ -1295,14 +1382,14 @@ def _write_bounds( ncdim_to_size = g["ncdim_to_size"] if bounds_ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} netCDF dimension for " + f" Writing size {size} dimension for " f"bounds: {bounds_ncdim}" ) # pragma: no cover ncdim_to_size[bounds_ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. parent_group = self._parent_group(bounds_ncdim) if g["group"] and "/" in bounds_ncdim: @@ -1316,11 +1403,13 @@ def _write_bounds( if not g["dry_run"]: try: - parent_group.createDimension(base_bounds_ncdim, size) + self._createDimension( + parent_group, base_bounds_ncdim, size + ) except RuntimeError: raise - # Set the netCDF bounds variable name + # Set the bounds dataset variable name default = coord_ncvar + "_bounds" else: default = "bounds" @@ -1330,11 +1419,11 @@ def _write_bounds( ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name (for now). ncvar = self._remove_group_structure(ncvar) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) # If no groups have been set on the bounds, then put the # bounds variable in the same group as its parent @@ -1353,7 +1442,7 @@ def _write_bounds( if self.implementation.has_property(coord, prop): omit.append(prop) - # Create the bounds netCDF variable + # Create the bounds dataset variable self._write_netcdf_variable( ncvar, ncdimensions, @@ -1381,14 +1470,14 @@ def _write_bounds( def _write_node_coordinates( self, f, coord, coord_ncvar, coord_ncdimensions ): - """Create a netCDF node coordinates variable. + """Create a node coordinates dataset variable. This will create: - * A netCDF node dimension, if required. - * A netCDF node count variable, if required. - * A netCDF part node count variable, if required. - * A netCDF interior ring variable, if required. + * A dataset node dimension, if required. + * A dataset node count variable, if required. + * A dataset part node count variable, if required. + * A dataset interior ring variable, if required. .. 
versionadded:: (cfdm) 1.8.0 @@ -1443,10 +1532,10 @@ def _write_node_coordinates( nodes, inherited_properties ) - # Find the base of the netCDF part dimension name + # Find the base of the 'part' dataset dimension name size = self.implementation.get_data_size(nodes) ncdim = self._get_node_ncdimension(nodes, default="node") - ncdim = self._netcdf_name(ncdim, dimsize=size, role="node") + ncdim = self._name(ncdim, dimsize=size, role="node") create = True if self._already_in_file(nodes, (ncdim,)): @@ -1465,7 +1554,7 @@ def _write_node_coordinates( create = False # We need to log the original Bounds variable as being - # in the file, too. This is so that the geometry + # in the dataset, too. This is so that the geometry # container variable can be created later on. g["seen"][id(bounds)] = { "ncvar": ncvar, @@ -1486,13 +1575,13 @@ def _write_node_coordinates( if ncdim not in ncdim_to_size: size = self.implementation.get_data_size(nodes) logger.info( - f" Writing size {size} netCDF node dimension: {ncdim}" + f" Writing size {size} geometry node dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -1501,9 +1590,10 @@ def _write_node_coordinates( ncdim = self._remove_group_structure(ncdim) if not g["dry_run"]: - parent_group.createDimension(ncdim, size) + # parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) - # Set an appropriate default netCDF node coordinates + # Set an appropriate default node coordinates dataset # variable name axis = self.implementation.get_property(bounds, "axis") if axis is not None: @@ -1515,13 +1605,13 @@ def _write_node_coordinates( bounds, default=default ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncvar = self._remove_group_structure(ncvar) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF node coordinates variable + # Create the node coordinates dataset variable self._write_netcdf_variable( ncvar, (ncdim,), @@ -1550,7 +1640,7 @@ def _write_node_coordinates( g["geometry_encoding"][ncvar] = encodings # We need to log the original Bounds variable as being in - # the file, too. This is so that the geometry container + # the dataset, too. This is so that the geometry container # variable can be created later on. g["seen"][id(bounds)] = { "ncvar": ncvar, @@ -1566,7 +1656,7 @@ def _write_node_coordinates( def _write_node_count( self, f, coord, bounds, coord_ncdimensions, encodings ): - """Create a netCDF node count variable. + """Create a node count dataset variable. .. versionadded:: (cfdm) 1.8.0 @@ -1577,7 +1667,7 @@ def _write_node_count( bounds: coord_ncdimensions: sequence of `str` - The netCDF instance dimension + The dataset instance dimension encodings: `dict` Ignored. 
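Several of the routines here ask `_name` for a dimension with a given size and role ('bounds', 'node', 'part', 'string_length'), so that, for example, every coordinate with two-valued cell bounds shares a single ``bounds2`` dimension. A simplified sketch of that reuse, using the ``dimensions_with_role`` and ``ncdim_to_size`` entries of ``write_vars`` (the real method also mangles names to keep them unique)::

    def example_role_dimension(g, base, dimsize, role):
        """Reuse or register a dataset dimension for a given role."""
        for ncdim in g["dimensions_with_role"].get(role, ()):
            if g["ncdim_to_size"][ncdim] == dimsize:
                # An existing dimension with this role and size
                return ncdim

        # No match: register a new dimension under this role
        g["dimensions_with_role"].setdefault(role, []).append(base)
        g["ncdim_to_size"][base] = dimsize
        return base

    # g = {"dimensions_with_role": {}, "ncdim_to_size": {}}
    # example_role_dimension(g, "bounds2", 2, "bounds")  -> 'bounds2'
    # example_role_dimension(g, "bounds2", 2, "bounds")  -> 'bounds2' (reused)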
@@ -1609,7 +1699,7 @@ def _write_node_count( count = self.implementation.initialise_Count() self.implementation.set_data(count, data, copy=False) - # Find the base of the netCDF node count variable name + # Find the base of the node count dataset variable name nc = self.implementation.get_node_count(coord) if nc is not None: @@ -1618,7 +1708,7 @@ def _write_node_count( ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncvar = self._remove_group_structure(ncvar) @@ -1640,12 +1730,12 @@ def _write_node_count( # created, so create it now. if geometry_dimension not in g["ncdim_to_size"]: raise ValueError( - "The netCDF geometry dimension should already exist ..." + "The dataset geometry dimension should already exist ..." ) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF node count variable + # Create the node count dataset variable self._write_netcdf_variable( ncvar, (geometry_dimension,), count, None ) @@ -1656,27 +1746,27 @@ def _write_node_count( def _get_part_ncdimension(self, coord, default=None): """Gets dimension name for part node counts or interior rings. - Specifically, gets the base of the netCDF dimension for part + Specifically, gets the base of the dataset dimension for part node count and interior ring variables. .. versionadded:: (cfdm) 1.8.0 :Returns: - The netCDF dimension name, or else the value of the *default* - parameter. + The dataset dimension name, or else the value of the + *default* parameter. """ ncdim = None pnc = self.implementation.get_part_node_count(coord) if pnc is not None: - # Try to get the netCDF dimension from a part node count + # Try to get the dataset dimension from a part node count # variable ncdim = self.implementation.nc_get_dimension(pnc, default=None) if ncdim is None: - # Try to get the netCDF dimension from an interior ring + # Try to get the dataset dimension from an interior ring # variable interior_ring = self.implementation.get_interior_ring(coord) if interior_ring is not None: @@ -1685,9 +1775,9 @@ def _get_part_ncdimension(self, coord, default=None): ) if ncdim is not None: - # Found a netCDF dimension + # Found a dataset dimension if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncdim = self._remove_group_structure(ncdim) @@ -1707,27 +1797,27 @@ def _parent_group(self, name): :Parameters: name: `str` - The name of the netCDF dimension or variable. + The name of the dataset dimension or variable. :Returns: - `netCDF.Dataset` or `netCDF._netCDF4.Group` + `netCDF.Dataset` or `netCDF.Group` or `zarr.Group` """ g = self.write_vars - parent_group = g["netcdf"] + parent_group = g["dataset"] if not g["group"] or "/" not in name: return parent_group if not name.startswith("/"): raise ValueError( - f"Invalid netCDF name {name!r}: missing a leading '/'" + f"Invalid dataset name {name!r}: missing a leading '/'" ) for group_name in name.split("/")[1:-1]: - parent_group = self._write_group(parent_group, group_name) + parent_group = self._createGroup(parent_group, group_name) return parent_group @@ -1802,7 +1892,7 @@ def _groups(self, name): return groups def _get_node_ncdimension(self, bounds, default=None): - """Get the netCDF dimension from a node count variable. + """Get the dataset dimension from a node count variable. .. 
versionadded:: (cfdm) 1.8.0 @@ -1814,15 +1904,15 @@ def _get_node_ncdimension(self, bounds, default=None): :Returns: - The netCDF dimension name, or else the value of the *default* + The dimension name, or else the value of the *default* parameter. """ ncdim = self.implementation.nc_get_dimension(bounds, default=None) if ncdim is not None: - # Found a netCDF dimension + # Found a dimension if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncdim = self._remove_group_structure(ncdim) @@ -1832,11 +1922,7 @@ def _get_node_ncdimension(self, bounds, default=None): return default def _write_part_node_count(self, f, coord, bounds, encodings): - """Creates a bounds netCDF variable and returns its name. - - Create a bounds netCDF variable, creating a new bounds netCDF - dimension if required. Return the bounds variable's netCDF - variable name. + """Creates a part node count variable and returns its name. .. versionadded:: (cfdm) 1.8.0 @@ -1844,9 +1930,6 @@ def _write_part_node_count(self, f, coord, bounds, encodings): coord: - coord_ncvar: `str` - The netCDF variable name of the parent variable - :Returns: `dict` @@ -1882,14 +1965,14 @@ def _write_part_node_count(self, f, coord, bounds, encodings): count = self.implementation.initialise_Count() self.implementation.set_data(count, data, copy=False) - # Find the base of the netCDF part_node_count variable name + # Find the base of the dataset part_node_count variable name pnc = self.implementation.get_part_node_count(coord) if pnc is not None: ncvar = self.implementation.nc_get_variable( pnc, default="part_node_count" ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncvar = self._remove_group_structure(ncvar) @@ -1900,7 +1983,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): else: ncvar = "part_node_count" - # Find the base of the netCDF part dimension name + # Find the base of the dataset part dimension name size = self.implementation.get_data_size(count) if g["part_ncdim"] is not None: ncdim = g["part_ncdim"] @@ -1908,7 +1991,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim = encodings["part_ncdim"] else: ncdim = self._get_part_ncdimension(coord, default="part") - ncdim = self._netcdf_name(ncdim, dimsize=size, role="part") + ncdim = self._name(ncdim, dimsize=size, role="part") if self._already_in_file(count, (ncdim,)): # This part node count variable has been previously @@ -1918,13 +2001,14 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim_to_size = g["ncdim_to_size"] if ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} netCDF part " f"dimension{ncdim}" + f" Writing size {size} geometry part " + f"dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. 
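Whenever ``g['group']`` is false a flat dataset has been requested, and the repeated calls to `_remove_group_structure` reduce a possibly group-qualified name to its basename. Roughly (illustrative helper; the real method is defined elsewhere in this class)::

    def example_remove_group_structure(name):
        """Return the basename of a possibly group-qualified name."""
        # "/forecast/model/tas" -> "tas"; "tas" -> "tas"
        return name.rpartition("/")[-1]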
parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -1933,11 +2017,12 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim = self._remove_group_structure(ncdim) if not g["dry_run"]: - parent_group.createDimension(ncdim, size) + # parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF part_node_count variable + # Create the dataset part_node_count variable self._write_netcdf_variable(ncvar, (ncdim,), count, None) g["part_ncdim"] = ncdim @@ -1946,7 +2031,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): return {"part_node_count": ncvar, "part_ncdim": ncdim} def _write_interior_ring(self, f, coord, bounds, encodings): - """Write an interior ring variable to the netCDF file. + """Write an interior ring variable to the dataset. .. versionadded:: (cfdm) 1.8.0 @@ -1955,7 +2040,7 @@ def _write_interior_ring(self, f, coord, bounds, encodings): coord: coord_ncvar: `str` - The netCDF variable name of the parent variable + The dataset variable name of the parent variable encodings: @@ -1985,8 +2070,8 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any group - # structure from the name. + # A flat dataset has been requested, so strip off any + # group structure from the name. ncvar = self._remove_group_structure(ncvar) size = self.implementation.get_data_size(interior_ring) @@ -1996,7 +2081,7 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim = encodings["part_ncdim"] else: ncdim = self._get_part_ncdimension(coord, default="part") - ncdim = self._netcdf_name(ncdim, dimsize=size, role="part") + ncdim = self._name(ncdim, dimsize=size, role="part") if self._already_in_file(interior_ring, (ncdim,)): # This interior ring variable has been previously created, @@ -2006,12 +2091,13 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim_to_size = g["ncdim_to_size"] if ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} netCDF part " f"dimension{ncdim}" + f" Writing size {size} geometry part " + f"dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -2020,11 +2106,12 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim = self._remove_group_structure(ncdim) if not g["dry_run"]: - parent_group.createDimension(ncdim, size) + # parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF interior ring variable + # Create the dataset interior ring variable self._write_netcdf_variable( ncvar, (ncdim,), @@ -2041,13 +2128,13 @@ def _write_interior_ring(self, f, coord, bounds, encodings): def _write_scalar_coordinate( self, f, key, coord_1d, axis, coordinates, extra=None ): - """Write a scalar coordinate and its bounds to the netCDF file. + """Write a scalar coordinate and its bounds to the dataset. - It is assumed that the input coordinate is has size 1, but this is not - checked. + It is assumed that the input coordinate has size 1, but this + is not checked. 
- If an equal scalar coordinate has already been written to the file - then the input coordinate is not written. + If an equal scalar coordinate has already been written to the + dataset then the input coordinate is not written. :Parameters: @@ -2065,7 +2152,8 @@ def _write_scalar_coordinate( :Returns: coordinates: `list` - The updated list of netCDF auxiliary coordinate names. + The updated list of auxiliary coordinate dataset + variable names. """ # To avoid mutable default argument (an anti-pattern) of extra={} @@ -2079,11 +2167,9 @@ def _write_scalar_coordinate( scalar_coord = self.implementation.squeeze(coord_1d, axes=0) if not self._already_in_file(scalar_coord, ()): - ncvar = self._create_netcdf_variable_name( - scalar_coord, default="scalar" - ) + ncvar = self._create_variable_name(scalar_coord, default="scalar") # If this scalar coordinate has bounds then create the - # bounds netCDF variable and add the 'bounds' or + # bounds dataset variable and add the 'bounds' or # 'climatology' (as appropriate) attribute to the # dictionary of extra attributes bounds_extra = self._write_bounds(f, scalar_coord, key, (), ncvar) @@ -2099,7 +2185,7 @@ def _write_scalar_coordinate( else: # This scalar coordinate has already been written to the - # file + # dataset ncvar = g["seen"][id(scalar_coord)]["ncvar"] g["axis_to_ncscalar"][axis] = ncvar @@ -2111,10 +2197,10 @@ def _write_scalar_coordinate( return coordinates def _write_auxiliary_coordinate(self, f, key, coord, coordinates): - """Write auxiliary coordinates and bounds to the netCDF file. + """Write auxiliary coordinates and bounds to the dataset. - If an equal auxiliary coordinate has already been written to the file - then the input coordinate is not written. + If an equal auxiliary coordinate has already been written to + the dataset then the input coordinate is not written. :Parameters: @@ -2131,16 +2217,16 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): :Returns: `list` - The list of netCDF auxiliary coordinate names updated in - place. + The list of auxiliary coordinate dataset variable + names updated in place. """ g = self.write_vars ncvar = None - # The netCDF dimensions for the auxiliary coordinate variable - ncdimensions = self._netcdf_dimensions(f, key, coord) + # The dataset dimensions for the auxiliary coordinate variable + ncdimensions = self._dataset_dimensions(f, key, coord) coord = self._change_reference_datetime(coord) @@ -2176,14 +2262,12 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): f, coord, key, ncdimensions, coord_ncvar=None ) else: - ncvar = self._create_netcdf_variable_name( - coord, default="auxiliary" - ) + ncvar = self._create_variable_name(coord, default="auxiliary") # TODO: move setting of bounds ncvar to here - why? # If this auxiliary coordinate has bounds then create - # the bounds netCDF variable and add the 'bounds', + # the bounds dataset variable and add the 'bounds', # 'climatology' or 'nodes' attribute (as appropriate) # to the dictionary of extra attributes. extra = self._write_bounds(f, coord, key, ncdimensions, ncvar) @@ -2207,10 +2291,10 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): return coordinates def _write_domain_ancillary(self, f, key, anc): - """Write a domain ancillary and its bounds to the netCDF file. + """Write a domain ancillary and its bounds to the dataset. - If an equal domain ancillary has already been written to the file - athen it is not re-written. 
+ If an equal domain ancillary has already been written to the + dataset then it is not re-written. .. versionadded:: (cfdm) 1.7.0 @@ -2226,7 +2310,7 @@ def _write_domain_ancillary(self, f, key, anc): :Returns: `str` - The netCDF variable name of the domain ancillary variable. + The dataset name of the domain ancillary variable. """ g = self.write_vars @@ -2234,19 +2318,19 @@ def _write_domain_ancillary(self, f, key, anc): if g["post_dry_run"]: logger.warning( "At present domain ancillary constructs of appended fields " - "may not be handled correctly by netCDF write append mode " + "may not be handled correctly by write append mode " "and can appear as extra fields. Set them on fields using " "`set_domain_ancillary` and similar methods if required." ) - ncdimensions = self._netcdf_dimensions(f, key, anc) + ncdimensions = self._dataset_dimensions(f, key, anc) create = not self._already_in_file(anc, ncdimensions, ignore_type=True) if not create: ncvar = g["seen"][id(anc)]["ncvar"] else: - # See if we can set the default netCDF variable name to + # See if we can set the default dataset variable name to # its formula_terms term default = None for ref in self.implementation.get_coordinate_references( @@ -2268,10 +2352,10 @@ def _write_domain_ancillary(self, f, key, anc): if default is None: default = "domain_ancillary" - ncvar = self._create_netcdf_variable_name(anc, default=default) + ncvar = self._create_variable_name(anc, default=default) - # If this domain ancillary has bounds then create the bounds - # netCDF variable + # If this domain ancillary has bounds then create the + # bounds dataset variable self._write_bounds(f, anc, key, ncdimensions, ncvar) # Create a new domain ancillary variable @@ -2293,10 +2377,10 @@ def _write_field_ancillary( key, anc, ): - """Write a field ancillary to the netCDF file. + """Write a field ancillary to the dataset. - If an equal field ancillary has already been written to the file - then it is not re-written. + If an equal field ancillary has already been written to the + dataset then it is not re-written. :Parameters: @@ -2309,7 +2393,7 @@ def _write_field_ancillary( :Returns: `str` - The netCDF variable name of the field ancillary + The dataset variable name of the field ancillary object. If no ancillary variable was written then an empty string is returned. @@ -2320,16 +2404,14 @@ def _write_field_ancillary( """ g = self.write_vars - ncdimensions = self._netcdf_dimensions(f, key, anc) + ncdimensions = self._dataset_dimensions(f, key, anc) create = not self._already_in_file(anc, ncdimensions) if not create: ncvar = g["seen"][id(anc)]["ncvar"] else: - ncvar = self._create_netcdf_variable_name( - anc, default="ancillary_data" - ) + ncvar = self._create_variable_name(anc, default="ancillary_data") # Create a new field ancillary variable self._write_netcdf_variable( @@ -2345,10 +2427,10 @@ def _write_field_ancillary( return ncvar def _write_cell_measure(self, f, key, cell_measure): - """Write a cell measure construct to the netCDF file. + """Write a cell measure construct to the dataset. - If an identical construct has already in the file then the cell - measure will not be written. + If an identical construct is already in the dataset then the + cell measure will not be written. 
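The cell measure code in the next hunk records external variables by name and republishes them as the dataset-level ``external_variables`` attribute, a single space-separated, sorted string. For example (illustrative helper)::

    def example_external_variables(existing, ncvar):
        """Add a name to the global 'external_variables' attribute."""
        names = set(existing.split()) if existing else set()
        names.add(ncvar)
        return " ".join(sorted(names))

    # example_external_variables("areacella", "volcello")
    # -> 'areacella volcello'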
:Parameters: @@ -2372,11 +2454,11 @@ def _write_cell_measure(self, f, key, cell_measure): measure = self.implementation.get_measure(cell_measure) if measure is None: raise ValueError( - "Can't create a CF-netCDF cell measure variable " + "Can't create a CF cell measure variable " "without a 'measure' property" ) - ncdimensions = self._netcdf_dimensions(f, key, cell_measure) + ncdimensions = self._dataset_dimensions(f, key, cell_measure) if self._already_in_file(cell_measure, ncdimensions): # Use existing cell measure variable @@ -2388,19 +2470,20 @@ def _write_cell_measure(self, f, key, cell_measure): ) if ncvar is None: raise ValueError( - "Can't create an external CF-netCDF cell measure " - "variable without a netCDF variable name" + "Can't create an external CF cell measure " + "variable without a dataset variable name" ) # Add ncvar to the global external_variables attribute self._set_external_variables(ncvar) if ( - g["external_file"] is not None + g["external_dataset"] is not None and self.implementation.get_data(cell_measure, None) is not None ): - # Create a new field to write out to the external file + # Create a new field to write out to the external + # dataset self._create_external( field=f, construct_id=key, @@ -2408,7 +2491,7 @@ def _write_cell_measure(self, f, key, cell_measure): ncdimensions=ncdimensions, ) else: - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( cell_measure, default="cell_measure" ) @@ -2438,14 +2521,19 @@ def _set_external_variables(self, ncvar): if ncvar not in external_variables: external_variables.add(ncvar) if not g["dry_run"] and not g["post_dry_run"]: - g["netcdf"].setncattr( - "external_variables", " ".join(sorted(external_variables)) + self._set_attributes( + { + "external_variables": " ".join( + sorted(external_variables) + ) + }, + group=g["dataset"], ) def _create_external( self, field=None, construct_id=None, ncvar=None, ncdimensions=None ): - """Creates a new field to flag to write to an external file. + """Creates a new field to flag to write to an external dataset. .. versionadded:: (cfdm) 1.7.0 @@ -2460,7 +2548,7 @@ def _create_external( field=field, construct_id=construct_id ) - # Set the correct netCDF variable and dimension names + # Set the correct dataset variable and dimension names self.implementation.nc_set_variable(external, ncvar) external_domain_axes = self.implementation.get_domain_axes(external) @@ -2484,26 +2572,133 @@ def _create_external( return external def _createVariable(self, **kwargs): - """Create a variable in the netCDF file. + """Create a variable in the dataset. .. 
versionadded:: (cfdm) 1.7.0 """ g = self.write_vars + ncvar = kwargs["varname"] - g["nc"][ncvar] = g["netcdf"].createVariable(**kwargs) + + match g["backend"]: + case "netCDF4": + netcdf4_kwargs = kwargs + if "dimensions" not in kwargs: + netcdf4_kwargs["dimensions"] = () + + contiguous = kwargs.get("contiguous") + + is_netcdf4 = g["dataset"].data_model.startswith("NETCDF4") + if is_netcdf4 and contiguous: + # NETCDF4 contiguous variables can't be compressed + kwargs["compression"] = None + kwargs["complevel"] = 0 + + # NETCDF4 contiguous variables can't span unlimited + # dimensions + unlimited_dimensions = g[ + "unlimited_dimensions" + ].intersection(kwargs.get("dimensions", ())) + if unlimited_dimensions: + data_model = g["dataset"].data_model + raise ValueError( + f"Can't create variable {ncvar!r} in " + f"{data_model} dataset: " + f"In {data_model} it is not allowed to write " + "contiguous (as opposed to chunked) data " + "that spans one or more unlimited dimensions: " + f"{unlimited_dimensions}" + ) + + if contiguous: + netcdf4_kwargs.pop("fletcher32", None) + + # Remove Zarr-specific kwargs + netcdf4_kwargs.pop("shape", None) + netcdf4_kwargs.pop("shards", None) + + variable = g["dataset"].createVariable(**netcdf4_kwargs) + + case "zarr": + shape = kwargs.get("shape", ()) + chunks = kwargs.get("chunksizes", shape) + shards = kwargs.get("shards") + + if chunks is None: + # One chunk for the entire array + chunks = shape + + if shards is not None: + # Create the shard shape in the format expected by + # `zarr.create_array`. 'shards' is currently + # defined by how many *chunks* along each + # dimension are in each shard, but `zarr` requires + # shards defined by how many *array elements* + # along each dimension are in each shard. + if chunks == shape: + # One chunk + # + # It doesn't matter what 'shards' is, because + # the data only has one chunk. + shards = None + else: + ndim = len(chunks) + if isinstance(shards, Integral): + # Make a conservative estimate of how many + # whole chunks along each dimension are in + # a shard. This may result in fewer than + # 'shards' chunks in each shard, but is + # guaranteed to give us a shard shape of + # less than the data shape, which is a + # `zarr` requirement. + n = int(shards ** (1 / ndim)) + shards = (n,) * ndim + + if prod(shards) > 1: + # More than one chunk per shard. + # + # E.g. shards=(10, 11, 12) and chunks=(10, + # 20, 30) => shards=(100, 220, 360) + shards = [c * n for c, n in zip(chunks, shards)] + else: + # One chunk per shard. + # + # E.g. shards=(1, 1, 1) => shards=None + shards = None + + dtype = kwargs["datatype"] + if dtype == "S1": + dtype = str + + zarr_kwargs = { + "name": ncvar, + "shape": shape, + "dtype": dtype, + "chunks": chunks, + "shards": shards, + "fill_value": kwargs.get("fill_value"), + "dimension_names": kwargs.get("dimensions", ()), + "storage_options": g.get("storage_options"), + "overwrite": g["overwrite"], + } + + variable = g["dataset"].create_array(**zarr_kwargs) + + g["nc"][ncvar] = variable def _write_grid_mapping(self, f, ref, multiple_grid_mappings): - """Write a grid mapping georeference to the netCDF file. + """Write a grid mapping georeference to the dataset. .. versionadded:: (cfdm) 1.7.0 :Parameters: - f: Field construct + f: `Field` or `Domain` - ref: Coordinate reference construct - The grid mapping coordinate reference to write to the file. + ref: `CoordinateReference` + The grid mapping coordinate reference to write to the + dataset. 
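The most intricate part of the new `_createVariable` is turning the user-facing sharding specification, which counts *chunks* per shard along each dimension (or gives a single integer total), into the element counts per shard that `zarr.create_array` expects. Pulled out as a standalone sketch (the helper name is illustrative)::

    from math import prod
    from numbers import Integral

    def example_shard_shape(shape, chunks, shards):
        """Convert chunks-per-shard counts into a zarr shard shape."""
        if shards is None or chunks == shape:
            # No sharding requested, or the data is a single chunk
            return None

        ndim = len(chunks)
        if isinstance(shards, Integral):
            # Spread the requested number of chunks conservatively
            # over the dimensions, so that the shard shape never
            # exceeds the data shape
            n = int(shards ** (1 / ndim))
            shards = (n,) * ndim

        if prod(shards) <= 1:
            # One chunk per shard: sharding is not needed
            return None

        # E.g. chunks=(10, 20, 30), shards=(10, 11, 12)
        #      -> shard shape (100, 220, 360)
        return tuple(c * n for c, n in zip(chunks, shards))

    # example_shard_shape(shape=(1000, 1000), chunks=(100, 100), shards=4)
    # -> (200, 200)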
multiple_grid_mappings: `bool` @@ -2524,16 +2719,15 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): self.implementation.get_coordinate_conversion_parameters(ref) ) default = cc_parameters.get("grid_mapping_name", "grid_mapping") - ncvar = self._create_netcdf_variable_name(ref, default=default) + ncvar = self._create_variable_name(ref, default=default) logger.info( - f" Writing {ref!r} to netCDF variable: {ncvar}" + f" Writing {ref!r} to variable: {ncvar}" ) # pragma: no cover kwargs = { "varname": ncvar, "datatype": "S1", - "dimensions": (), "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) @@ -2547,7 +2741,7 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): common = set(parameters).intersection(cc_parameters) if common: raise ValueError( - "Can't create CF-netCDF grid mapping variable: " + "Can't create CF grid mapping variable: " f"{common.pop()!r} is defined as both a coordinate " "conversion and a datum parameter." ) @@ -2567,13 +2761,13 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): parameters[term] = value if not g["dry_run"]: - g["nc"][ncvar].setncatts(parameters) + self._set_attributes(parameters, ncvar) # Update the 'seen' dictionary g["seen"][id(ref)] = { "variable": ref, "ncvar": ncvar, - # Grid mappings have no netCDF dimensions + # Grid mappings variables are scalar "ncdims": (), } @@ -2626,7 +2820,7 @@ def _write_netcdf_variable( The netCDF dimension names of the variable cfvar: `Variable` or `Data` - The construct to write to the netCDF file. + The construct to write to the dataset. domain_axes: `None`, or `tuple` of `str` The domain axis construct identifiers for *cfvar*. @@ -2648,11 +2842,11 @@ def _write_netcdf_variable( .. versionadded:: (cfdm) 1.10.1.0 - chunking: sequence of `int`, optional - Set `netCDF4.createVariable` 'contiguous' and - `chunksizes` parameters (in that order). If not set - (the default), then these parameters are inferred from - the data. + chunking: sequence, optional + Set `_createVariable` 'contiguous', 'chunksizes', and + 'shards' parameters (in that order). If `None` (the + default), then these parameters are inferred from the + data. .. versionadded:: (cfdm) 1.12.0.0 @@ -2691,7 +2885,9 @@ def _write_netcdf_variable( if g["dry_run"]: return - logger.info(f" Writing {cfvar!r}") # pragma: no cover + logger.info( + f" Writing {cfvar!r} to variable: {ncvar}" + ) # pragma: no cover # Set 'construct_type' if not construct_type: @@ -2699,12 +2895,12 @@ def _write_netcdf_variable( # Do this after the dry_run return else may attempt to transform # the arrays with string dtype on an append-mode read iteration (bad). + datatype = None if not domain_variable: datatype = self._datatype(cfvar) data, ncdimensions = self._transform_strings( data, ncdimensions, - # cfvar, data, ncdimensions ) # Whether or not to write the data @@ -2715,14 +2911,22 @@ def _write_netcdf_variable( # filled before any data is written. if the fill value is # False then the variable is not pre-filled. 
# ------------------------------------------------------------ - if ( - omit_data or fill or g["post_dry_run"] - ): # or append mode's appending iteration - fill_value = self.implementation.get_property( - cfvar, "_FillValue", None - ) - else: - fill_value = None + match g["backend"]: + case "netCDF4": + if ( + omit_data or fill or g["post_dry_run"] + ): # or append mode's appending iteration + fill_value = self.implementation.get_property( + cfvar, "_FillValue", None + ) + else: + fill_value = None + + case "zarr": + # Set the `zarr` fill_value to the missing value of + # 'cfvar', defaulting to the netCDF default fill value + # if no missing value is available + fill_value = self._missing_value(cfvar, datatype) if data_variable: lsd = g["least_significant_digit"] @@ -2731,19 +2935,20 @@ def _write_netcdf_variable( # Set the dataset chunk strategy if chunking: - contiguous, chunksizes = chunking + contiguous, chunksizes, shards = chunking else: - contiguous, chunksizes = self._chunking_parameters( + contiguous, chunksizes, shards = self._chunking_parameters( data, ncdimensions ) logger.debug( - f" chunksizes: {chunksizes}\n" - f" contiguous: {contiguous}" + f" chunksizes: {chunksizes!r}, " + f"contiguous: {contiguous!r}, " + f"shards: {shards!r}" ) # pragma: no cover # ------------------------------------------------------------ - # Check that each dimension of the netCDF variable is in the + # Check that each dimension of the dataset variable is in the # same group or a parent group (CF>=1.8) # ------------------------------------------------------------ if g["group"]: @@ -2752,34 +2957,53 @@ def _write_netcdf_variable( ncdim_groups = self._groups(ncdim) if not groups.startswith(ncdim_groups): raise ValueError( - f"Can't create netCDF variable {ncvar!r} from " - f"{cfvar!r} with netCDF dimension {ncdim!r} that is " + f"Can't create variable {ncvar!r} from " + f"{cfvar!r} with dimension {ncdim!r} that is " "not in the same group nor in a parent group." ) # ------------------------------------------------------------ - # Replace netCDF dimension names with their basenames - # (CF>=1.8) - # ------------------------------------------------------------ - ncdimensions_basename = [ - self._remove_group_structure(ncdim) for ncdim in ncdimensions - ] - - # ------------------------------------------------------------ - # Create a new netCDF variable + # Create a new dataset variable # ------------------------------------------------------------ kwargs = { "varname": ncvar, "datatype": datatype, - "dimensions": ncdimensions_basename, "endian": g["endian"], "contiguous": contiguous, "chunksizes": chunksizes, + "shards": shards, "least_significant_digit": lsd, "fill_value": fill_value, "chunk_cache": g["chunk_cache"], } + # ------------------------------------------------------------ + # Replace dataset dimension names with their basenames + # (CF>=1.8) + # ------------------------------------------------------------ + if g["backend"] == "zarr": + # ... but not for Zarr. This is because the Zarr data + # model doesn't have the concept of dimensions belonging + # to a group (unlike netCDF), so by keeping the group + # structure in the dimension names we can know which group + # they belong to. 
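For the zarr backend every variable gets a real ``fill_value``, taken from the construct's missing value and falling back to the netCDF default fill value for the target data type; `_write_data` later relies on it to fill masked elements, since Zarr cannot store masked arrays. A sketch of that choice, assuming `_missing_value` (not shown in this patch) behaves as the comment above describes::

    import netCDF4

    def example_missing_value(cfvar, datatype):
        """Choose a zarr fill_value for a construct's data."""
        for prop in ("missing_value", "_FillValue"):
            value = cfvar.get_property(prop, None)
            if value is not None:
                return value

        # Fall back to the netCDF default fill value for this type,
        # e.g. 9.969209968386869e+36 for 'f8'
        return netCDF4.default_fillvals.get(datatype)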
+ kwargs["dimensions"] = ncdimensions + else: + ncdimensions_basename = [ + self._remove_group_structure(ncdim) for ncdim in ncdimensions + ] + kwargs["dimensions"] = ncdimensions_basename + + if data is not None: + compressed = self._compressed_data(ncdimensions) + if compressed: + # Write data in its compressed form + shape = data.source().source().shape + else: + shape = data.shape + + kwargs["shape"] = shape + # ------------------------------------------------------------ # Create a quantization container variable, add any extra # quantization attributes, and if required instruct @@ -2810,11 +3034,17 @@ def _write_netcdf_variable( q, netcdf_parameter, None ) - # Create a quantization container variable in the file, if - # it doesn't already exist (and after having removed any - # per-variable quantization parameters, such as + # Create a quantization container variable in the dataset, + # if it doesn't already exist (and after having removed + # any per-variable quantization parameters, such as # "quantization_nsd"). if quantize_on_write: + if g["backend"] == "zarr": + raise NotImplementedError( + f"Can't yet quantize-on-write {cfvar!r} to a Zarr " + "dataset" + ) + # Set "implemention" to this version of the netCDF-C # library import netCDF4 @@ -2862,7 +3092,7 @@ def _write_netcdf_variable( if g["fmt"] not in NETCDF4_FMTS: raise ValueError( f"Can't quantize {cfvar!r} into a {g['fmt']} " - "format file. Quantization is only possible when " + "format dataset. Quantization is only possible when " f"writing to one of the {NETCDF4_FMTS} formats." ) @@ -2908,7 +3138,7 @@ def _write_netcdf_variable( if g["cfa"].get("strict", True): # Raise the exception in 'strict' mode if g["mode"] == "w": - os.remove(g["filename"]) + self.dataset_remove() raise @@ -2921,10 +3151,12 @@ def _write_netcdf_variable( # keyword arguments. This is necessary because the # dimensions and dataset chunking strategy will # otherwise reflect the aggregated data in memory, - # rather than the scalar variable in the file. - kwargs["dimensions"] = () + # rather than the scalar variable in the dataset. kwargs["contiguous"] = True kwargs["chunksizes"] = None + kwargs["dimensions"] = () + kwargs["shape"] = () + kwargs["shards"] = None # Add compression parameters (but not for scalars or vlen # strings). @@ -2945,52 +3177,30 @@ def _write_netcdf_variable( ) logger.info( - f" to netCDF variable: {ncvar}({', '.join(ncdimensions)})" + f" dimensions: ({', '.join(ncdimensions)})" ) # pragma: no cover - # Adjust createVariable arguments for contiguous variables - if kwargs["contiguous"]: - if g["netcdf"].data_model.startswith("NETCDF4"): - # NETCDF4 contiguous variables can't span unlimited - # dimensions - unlimited_dimensions = g["unlimited_dimensions"].intersection( - kwargs["dimensions"] - ) - if unlimited_dimensions: - data_model = g["netcdf"].data_model - raise ValueError( - f"Can't create variable {ncvar!r} in {data_model} " - f"file from {cfvar!r}: In {data_model} it is not " - "allowed to write contiguous (as opposed to chunked) " - "data that spans one or more unlimited dimensions: " - f"{unlimited_dimensions}" - ) - - # NETCDF4 contiguous variables can't be compressed - kwargs["compression"] = None - kwargs["complevel"] = 0 - try: self._createVariable(**kwargs) except RuntimeError as error: error = str(error) message = ( - f"Can't create variable in {g['netcdf'].data_model} file " + f"Can't create variable in {g['netcdf'].data_model} dataset " f"from {cfvar!r}: {error}. 
" - f"netCDF4.createVariable arguments: {kwargs}" + f"_createVariable arguments: {kwargs}" ) if error == ( "NetCDF: Not a valid data type or _FillValue type mismatch" ): raise ValueError( f"Can't write {cfvar.data.dtype.name} data from {cfvar!r} " - f"to a {g['netcdf'].data_model} file. " + f"to a {g['netcdf'].data_model} dataset. " "Consider using a netCDF4 format, or use the 'datatype' " "parameter, or change the datatype before writing." ) elif error == "NetCDF: NC_UNLIMITED in the wrong index": raise RuntimeError( - f"{message}. In a {g['netcdf'].data_model} file the " + f"{message}. In a {g['netcdf'].data_model} dataset the " "unlimited dimension must be the first (leftmost) " "dimension of the variable. " "Consider using a netCDF4 format." @@ -2999,14 +3209,14 @@ def _write_netcdf_variable( raise RuntimeError(message) # ------------------------------------------------------------ - # Write attributes to the netCDF variable + # Write attributes to the dataset variable # ------------------------------------------------------------ - attributes = self._write_attributes( + attributes = self._write_variable_attributes( cfvar, ncvar, extra=extra, omit=omit ) # ------------------------------------------------------------ - # Write data to the netCDF variable + # Write data to the dataset variable # # Note that we don't need to worry about scale_factor and # add_offset, since if a data array is *not* a numpy array, @@ -3046,7 +3256,7 @@ def _write_netcdf_variable( def _customise_createVariable( self, cfvar, construct_type, domain_axes, kwargs ): - """Customises `netCDF4.Dataset.createVariable` keywords. + """Customises `_createVariable` keywords. The keyword arguments may be changed in subclasses which override this method. @@ -3074,13 +3284,11 @@ def _customise_createVariable( `dict` Dictionary of keyword arguments to be passed to - `netCDF4.Dataset.createVariable`. + `_createVariable`. """ # This method is trivial but the intention is that subclasses - # will override it to perform any desired - # customisation. Notably see the equivalent method in - # cf-python which is non-trivial. + # may override it to perform any desired customisation. return kwargs def _transform_strings(self, data, ncdimensions): @@ -3112,6 +3320,7 @@ def _transform_strings(self, data, ncdimensions): array = self._numpy_compressed(array) strlen = len(max(array, key=len)) + del array data = self._convert_to_char(data) ncdim = self._string_length_dimension(strlen) @@ -3133,11 +3342,11 @@ def _write_data( construct_type=None, cfa=None, ): - """Write a data array to the netCDF file. + """Write a data array to the dataset. :Parameters: - data: Data instance + data: `Data` instance cfvar: cfdm instance @@ -3153,8 +3362,8 @@ def _write_data( unset_values: sequence of numbers attributes: `dict`, optional - The netCDF attributes for the constructs that have been - written to the file. + The dataset attributes for the constructs that have + been written to the dataset. construct_type: `str` The construct type of the *cfvar*, or its parent if @@ -3187,6 +3396,8 @@ def _write_data( # ------------------------------------------------------------ import dask.array as da + zarr = g["backend"] == "zarr" + if compressed: # Write data in its compressed form data = data.source().source() @@ -3209,6 +3420,24 @@ def _write_data( meta=np.array((), dx.dtype), ) + # Initialise the dataset lock for the data writing from Dask + lock = None + + # Rechunk the Dask array to shards, if applicable. 
+ if zarr: + # When a Zarr variable is sharded, the Dask array must be + # rechunked to the shards because "when writing data, a + # full shard must be written in one go for optimal + # performance and to avoid concurrency issues." + # https://zarr.readthedocs.io/en/stable/user-guide/arrays.html#sharding + shards = g["nc"][ncvar].shards + if shards is not None: + dx = dx.rechunk(shards) + # This rechunking has aligned Dask chunk boundaries + # with Zarr chunk boundaries, so we don't need to lock + # the write. + lock = False + # Check for out-of-range values if g["warn_valid"]: if construct_type: @@ -3223,7 +3452,47 @@ def _write_data( meta=np.array((), dx.dtype), ) - da.store(dx, g["nc"][ncvar], compute=True, return_stored=False) + if zarr: + # `zarr` can't write a masked array to a variable, so we + # have to manually replace missing data with the fill + # value. + dx = dx.map_blocks( + self._filled_array, + meta=np.array((), dx.dtype), + fill_value=g["nc"][ncvar].fill_value, + ) + + if lock is None: + # We need to define the dataset lock for data writing from + # Dask + from cfdm.data.locks import netcdf_lock as lock + + da.store( + dx, g["nc"][ncvar], compute=True, return_stored=False, lock=lock + ) + + def _filled_array(self, array, fill_value): + """Replace masked values with a fill value. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + array: `numpy.ndarray` + The arry to be filled. + + fill_value: + The fill value. + + :Returns: + + `numpy.ndarray` + + """ + if np.ma.isMA(array): + return array.filled(fill_value) + + return array def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -3283,7 +3552,7 @@ def _check_valid(self, array, cfvar=None, attributes=None): print( message.format( cfvar, - self.write_vars["filename"], + self.write_vars["dataset_name"], "less", "minimum", prop, @@ -3306,7 +3575,7 @@ def _check_valid(self, array, cfvar=None, attributes=None): print( message.format( cfvar, - self.write_vars["filename"], + self.write_vars["dataset_name"], "greater", "maximum", prop, @@ -3346,7 +3615,7 @@ def _convert_to_char(self, data): def _write_field_or_domain( self, f, add_to_seen=False, allow_data_insert_dimension=True ): - """Write a field or domain construct to the file. + """Write a field or domain construct to the dataset. All of the metadata constructs are also written. @@ -3368,6 +3637,7 @@ def _write_field_or_domain( import re g = self.write_vars + ncdim_size_to_spanning_constructs = [] seen = g["seen"] @@ -3402,30 +3672,30 @@ def _write_field_or_domain( # axes that define the domain. CF-1.9 data_axes = list(self.implementation.get_domain_axes(f)) - # Mapping of domain axis identifiers to netCDF dimension + # Mapping of domain axis identifiers to dataset dimension # names. This gets reset for each new field/domain that is - # written to the file. + # written to the dataset. # # For example: {'domainaxis1': 'lon'} g["axis_to_ncdim"] = {} - # Mapping of domain axis identifiers to netCDF scalar + # Mapping of domain axis identifiers to dataset scalar # coordinate variable names. This gets reset for each new - # field/domain that is written to the file. + # field/domain that is written to the dataset. # # For example: {'domainaxis0': 'time'} g["axis_to_ncscalar"] = {} - # Mapping of construct internal identifiers to netCDF variable - # names. This gets reset for each new field/domain that is - # written to the file. + # Mapping of construct internal identifiers to dataset + # variable names. 
This gets reset for each new field/domain + # that is written to the dataset. # # For example: {'dimensioncoordinate1': 'longitude'} g["key_to_ncvar"] = {} - # Mapping of construct internal identifiers to their netCDF + # Mapping of construct internal identifiers to their dataset # dimensions. This gets reset for each new field/domain that - # is written to the file. + # is written to the dataset. # # For example: {'dimensioncoordinate1': ['longitude']} g["key_to_ncdims"] = {} @@ -3481,7 +3751,7 @@ def _write_field_or_domain( ugrid = self.implementation.has_domain_topology(f) if ugrid: raise NotImplementedError( - "Can't yet create UGRID cf-netCDF files. " + "Can't yet write UGRID datasets. " "This feature is coming soon ..." ) @@ -3575,7 +3845,7 @@ def _write_field_or_domain( # ---------------------------------------------------- if axis in data_axes: # The data array spans this domain axis, so write - # the dimension coordinate to the file as a + # the dimension coordinate to the dataset as a # coordinate variable. ncvar = self._write_dimension_coordinate( f, key, dim_coord, ncdim=ncdim, coordinates=coordinates @@ -3596,7 +3866,7 @@ def _write_field_or_domain( # auxiliary coordinates, cell measures, domain # ancillaries or field ancillaries which span # this domain axis. Therefore write the - # dimension coordinate to the file as a + # dimension coordinate to the dataset as a # coordinate variable. ncvar = self._write_dimension_coordinate( f, @@ -3618,8 +3888,8 @@ def _write_field_or_domain( # coordinates, cell measures, domain # ancillaries or field ancillaries which span # this domain axis. Therefore write the - # dimension coordinate to the file as a scalar - # coordinate variable. + # dimension coordinate to the dataset as a + # scalar coordinate variable. coordinates = self._write_scalar_coordinate( f, key, dim_coord, axis, coordinates ) @@ -3660,7 +3930,7 @@ def _write_field_or_domain( data_axes.append(axis) # If the data array (now) spans this domain axis then - # create a netCDF dimension for it + # create a dataset dimension for it if axis in data_axes: axis_size0 = self.implementation.get_domain_axis_size( f, axis @@ -3727,7 +3997,7 @@ def _write_field_or_domain( and len(data_axes) == 2 and axis == data_axes[1] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][axis] = "ragged_contiguous_element" elif ( @@ -3735,7 +4005,7 @@ def _write_field_or_domain( and len(data_axes) == 2 and axis == data_axes[1] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][axis] = "ragged_indexed_element" elif ( @@ -3743,7 +4013,7 @@ def _write_field_or_domain( and len(data_axes) == 3 and axis == data_axes[1] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][ axis @@ -3753,7 +4023,7 @@ def _write_field_or_domain( and len(data_axes) == 3 and axis == data_axes[2] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][ axis @@ -3767,11 +4037,12 @@ def _write_field_or_domain( ) if not g["group"]: - # A flat file has been requested, so strip - # off any group structure from the name. + # A flat dataset has been requested, so + # strip off any group structure from the + # name. 
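+ # e.g. a dimension named '/forecast/model/lat' is
+ # written out simply as 'lat'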
ncdim = self._remove_group_structure(ncdim) - ncdim = self._netcdf_name(ncdim) + ncdim = self._name(ncdim) unlimited = self._unlimited(f, axis) self._write_dimension( @@ -3808,8 +4079,8 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by gathering # - # Write the list variable to the file, making a note - # of the netCDF sample dimension. + # Write the list variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- list_variable = self.implementation.get_list(f) compress = " ".join(compressed_ncdims) @@ -3821,8 +4092,8 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by contiguous ragged array # - # Write the count variable to the file, making a note - # of the netCDF sample dimension. + # Write the count variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- count = self.implementation.get_count(f) sample_ncdim = self._write_count_variable( @@ -3833,16 +4104,16 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by indexed ragged array # - # Write the index variable to the file, making a note - # of the netCDF sample dimension. + # Write the index variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- index = self.implementation.get_index(f) index_ncdim = self.implementation.nc_get_dimension( index, default="sample" ) if not g["group"]: - # A flat file has been requested, so strip off any - # group structure from the name. + # A flat dataset has been requested, so strip off + # any group structure from the name. index_ncdim = self._remove_group_structure(index_ncdim) sample_ncdim = self._write_index_variable( @@ -3858,8 +4129,8 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by indexed contigous ragged array # - # Write the index variable to the file, making a note - # of the netCDF sample dimension. + # Write the index variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- count = self.implementation.get_count(f) count_ncdim = self.implementation.nc_get_dimension( @@ -3867,8 +4138,8 @@ def _write_field_or_domain( ) if not g["group"]: - # A flat file has been requested, so strip off any - # group structure from the name. + # A flat dataset has been requested, so strip off + # any group structure from the name. count_ncdim = self._remove_group_structure(count_ncdim) sample_ncdim = self._write_count_variable( @@ -3876,7 +4147,7 @@ def _write_field_or_domain( ) if not g["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. 
sample_ncdim = self._remove_group_structure(sample_ncdim) @@ -3946,7 +4217,7 @@ def _write_field_or_domain( ) # ------------------------------------------------------------ - # Create netCDF variables from domain ancillaries + # Create dataset variables from domain ancillaries # ------------------------------------------------------------ for key, anc in sorted( self.implementation.get_domain_ancillaries(f).items() @@ -3954,7 +4225,7 @@ def _write_field_or_domain( self._write_domain_ancillary(f, key, anc) # ------------------------------------------------------------ - # Create netCDF variables from cell measures + # Create dataset variables from cell measures # ------------------------------------------------------------ # Set the list of 'cell_measures' attribute values (each of # the form 'measure: name') @@ -3966,7 +4237,7 @@ def _write_field_or_domain( ] # ------------------------------------------------------------ - # Create netCDF formula_terms attributes from vertical + # Create formula_terms dataset attributes from vertical # coordinate references # ------------------------------------------------------------ for ref in g["formula_terms_refs"]: @@ -4040,8 +4311,9 @@ def _write_field_or_domain( if id(domain_anc) not in seen: continue - # Get the netCDF variable name for the domain - # ancillary and add it to the formula_terms attribute + # Get the dataset variable name for the domain + # ancillary and add it to the formula_terms + # attribute ncvar = seen[id(domain_anc)]["ncvar"] formula_terms.append(f"{term}: {ncvar}") @@ -4064,15 +4336,15 @@ def _write_field_or_domain( formula_terms = " ".join(formula_terms) if not g["dry_run"] and not g["post_dry_run"]: try: - g["nc"][ncvar].setncattr( - "formula_terms", formula_terms + self._set_attributes( + {"formula_terms": formula_terms}, ncvar ) except KeyError: pass # TODO convert to 'raise' via fixes upstream logger.info( - " Writing formula_terms attribute to " - f"netCDF variable {ncvar}: {formula_terms!r}" + " Writing formula_terms attribute to variable " + f"{ncvar}: {formula_terms!r}" ) # pragma: no cover # Add the formula_terms attribute to the parent @@ -4082,14 +4354,15 @@ def _write_field_or_domain( bounds_formula_terms = " ".join(bounds_formula_terms) if not g["dry_run"] and not g["post_dry_run"]: try: - g["nc"][bounds_ncvar].setncattr( - "formula_terms", bounds_formula_terms + self._set_attributes( + {"formula_terms": bounds_formula_terms}, + bounds_ncvar, ) except KeyError: pass # TODO convert to 'raise' via fixes upstream logger.info( - " Writing formula_terms to netCDF bounds variable " + " Writing formula_terms to bounds variable " f"{bounds_ncvar}: {bounds_formula_terms!r}" ) # pragma: no cover @@ -4098,7 +4371,7 @@ def _write_field_or_domain( self._create_vertical_datum(ref, owning_coord_key) # ------------------------------------------------------------ - # Create netCDF variables grid mappings + # Create dataset grid mapping variables # ------------------------------------------------------------ multiple_grid_mappings = len(g["grid_mapping_refs"]) > 1 @@ -4110,8 +4383,8 @@ def _write_field_or_domain( # ------------------------------------------------------------ # Field ancillary variables # - # Create the 'ancillary_variables' CF-netCDF attribute and - # create the referenced CF-netCDF ancillary variables + # Create the 'ancillary_variables' CF attribute and create the + # referenced dataset ancillary variables # ------------------------------------------------------------ if field: ancillary_variables = [ @@ 
-4122,14 +4395,14 @@ def _write_field_or_domain( ] # ------------------------------------------------------------ - # Create the CF-netCDF data/domain variable + # Create the data/domain dataset variable # ------------------------------------------------------------ if field: default = "data" else: default = "domain" - ncvar = self._create_netcdf_variable_name(f, default=default) + ncvar = self._create_variable_name(f, default=default) ncdimensions = data_ncdimensions @@ -4138,9 +4411,9 @@ def _write_field_or_domain( # Cell measures if cell_measures: cell_measures = " ".join(cell_measures) - logger.info( + logger.debug( " Writing cell_measures attribute to " - f"netCDF variable {ncvar}: {cell_measures!r}" + f"variable {ncvar}: {cell_measures!r}" ) # pragma: no cover extra["cell_measures"] = cell_measures @@ -4150,7 +4423,7 @@ def _write_field_or_domain( coordinates = " ".join(coordinates) logger.info( " Writing coordinates attribute to " - f"netCDF variable {ncvar}: {coordinates!r}" + f"variable {ncvar}: {coordinates!r}" ) # pragma: no cover extra["coordinates"] = coordinates @@ -4160,7 +4433,7 @@ def _write_field_or_domain( grid_mapping = " ".join(grid_mapping) logger.info( " Writing grid_mapping attribute to " - f"netCDF variable {ncvar}: {grid_mapping!r}" + f"variable {ncvar}: {grid_mapping!r}" ) # pragma: no cover extra["grid_mapping"] = grid_mapping @@ -4171,7 +4444,7 @@ def _write_field_or_domain( ancillary_variables = re.sub(r"\s+", " ", ancillary_variables) logger.info( " Writing ancillary_variables attribute to " - f"netCDF variable {ncvar}: {ancillary_variables!r}" + f"variable {ncvar}: {ancillary_variables!r}" ) # pragma: no cover extra["ancillary_variables"] = ancillary_variables @@ -4208,7 +4481,7 @@ def _write_field_or_domain( cell_methods = " ".join(cell_methods_strings) logger.info( " Writing cell_methods attribute to " - f"netCDF variable {ncvar}: {cell_methods}" + f"variable {ncvar}: {cell_methods}" ) # pragma: no cover extra["cell_methods"] = cell_methods @@ -4225,7 +4498,7 @@ def _write_field_or_domain( extra["geometry"] = gc_ncvar # ------------------------------------------------------------ - # Create a new CF-netCDF data/domain variable + # Create a new data/domain dataset variable # ------------------------------------------------------------ # Omit any global attributes from the variable omit = g["global_attributes"] @@ -4332,8 +4605,8 @@ def _create_vertical_datum(self, ref, coord_key): ncvar = self.implementation.nc_get_variable(datum) if ncvar is not None: if not self.write_vars["group"]: - # A flat file has been requested, so strip off any - # group structure from the name. + # A flat dataset has been requested, so strip off + # any group structure from the name. ncvar = self._remove_group_structure(ncvar) self.implementation.nc_set_variable(new_grid_mapping, ncvar) @@ -4360,26 +4633,8 @@ def _unlimited(self, field, axis): """ return self.implementation.nc_is_unlimited_axis(field, axis) - def _write_group(self, parent_group, group_name): - """Creates a new netCDF4 parent group object. - - .. versionadded:: (cfdm) 1.8.6.0 - - :Parameters: - - parent_group: `netCDF4.Dateset` or `netCDF4._netCDF4.Group` - - group_name: `str` - - :Returns: - - `netCDF4._netCDF4.Group` - - """ - return parent_group.createGroup(group_name) - def _write_group_attributes(self, fields): - """Writes the netCDF group-level attributes to the file. + """Writes the group-level attributes to the dataset. 
:Parameters: @@ -4428,7 +4683,7 @@ def _write_group_attributes(self, fields): break # -------------------------------------------------------- - # Write the group-level attributes to the file + # Write the group-level attributes to the dataset # -------------------------------------------------------- # Replace None values with actual values for attr, value in this_group_attributes.items(): @@ -4439,25 +4694,59 @@ def _write_group_attributes(self, fields): f0, attr ) - nc = g["netcdf"] - for group in groups: - if group in nc.groups: - nc = nc.groups[group] - else: - nc = self._create_netcdf_group(nc, group) + nc = self._get_group(g["dataset"], groups) if not g["dry_run"]: - nc.setncatts(this_group_attributes) + self._set_attributes(this_group_attributes, group=nc) group_attributes[groups] = tuple(this_group_attributes) g["group_attributes"] = group_attributes + def _get_group(self, parent, groups): + """Get the group of *parent* defined by *groups*. + + The group will be created if it doesn't already exist. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + parent: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` + The group in which to find or create new group. + + groups: sequence of `str` + The group defined by the sequence of its subgroups + relative to *parent*, e.g. ``('forecast', 'model')``. + + :Returns: + + `netCDF4.Group` or `Zarr.Group` + The group. + + """ + match self.write_vars["backend"]: + case "netCDF4": + for group in groups: + if group in parent.groups: + parent = parent.groups[group] + else: + parent = self._createGroup(parent, group) + + case "zarr": + group = "/".join(groups) + if group in parent: + parent = parent[group] + else: + parent = self._createGroup(parent, group) + + return parent + def _write_global_attributes(self, fields): - """Writes all netCDF global properties to the netCDF4 dataset. + """Writes all global properties to the dataset. - Specifically, finds the netCDF global properties from all of - the input fields and writes them to the `netCDF4.Dataset`. + Specifically, finds the global properties from all of the + input fields and writes them to the root group of the dataset. 
:Parameters: @@ -4555,7 +4844,7 @@ def _write_global_attributes(self, fields): break # ----------------------------------------------------------- - # Write the Conventions global attribute to the file + # Write the Conventions global attribute to the dataset # ------------------------------------------------------------ delimiter = " " set_Conventions = force_global.pop("Conventions", None) @@ -4594,34 +4883,78 @@ def _write_global_attributes(self, fields): delimiter = "," if not g["dry_run"] and not g["post_dry_run"]: - g["netcdf"].setncattr( - "Conventions", delimiter.join(g["Conventions"]) - ) + attrs = {"Conventions": delimiter.join(g["Conventions"])} # ------------------------------------------------------------ - # Write the file descriptors to the file + # Write the file descriptors to the dataset # ------------------------------------------------------------ - for attr, value in g["file_descriptors"].items(): - g["netcdf"].setncattr(attr, value) + attrs.update(g["file_descriptors"]) # ------------------------------------------------------------ - # Write other global attributes to the file + # Write other global attributes to the dataset # ------------------------------------------------------------ - for attr in global_attributes - set(("Conventions",)): - g["netcdf"].setncattr( - attr, self.implementation.get_property(f0, attr) - ) + attrs.update( + { + attr: self.implementation.get_property(f0, attr) + for attr in global_attributes - set(("Conventions",)) + } + ) # ------------------------------------------------------------ - # Write "forced" global attributes to the file + # Write "forced" global attributes to the dataset # ------------------------------------------------------------ - for attr, v in force_global.items(): - g["netcdf"].setncattr(attr, v) + attrs.update(force_global) + + self._set_attributes(attrs, group=g["dataset"]) g["global_attributes"] = global_attributes - def file_close(self, filename): - """Close the netCDF file that has been written. + def dataset_exists(self, dataset): + """Whether or not a dataset exists on disk. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `str` + The name of the dataset. + + :Returns: + + `bool` + Whether or not the dataset exists on disk. + + """ + match self.write_vars["dataset_type"]: + case "file": + return os.path.isfile(dataset) + + case "directory": + return os.path.isdir(dataset) + + def dataset_remove(self): + """Remove the dataset that is being created. + + .. note:: If the dataset is a directory, then it is silently + not removed. To do so could be very dangerous (what + if it were your home space?). + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `None` + + """ + g = self.write_vars + match g["dataset_type"]: + case "file": + os.remove(g["dataset_name"]) + case "directory": + pass + + def dataset_close(self): + """Close the dataset that has been written. .. versionadded:: (cfdm) 1.7.0 @@ -4630,61 +4963,87 @@ def file_close(self, filename): `None` """ - self.write_vars["netcdf"].close() + g = self.write_vars + if g["backend"] == "netCDF4": + g["dataset"].close() - def file_open(self, filename, mode, fmt, fields): - """Open the netCDF file for writing. + def dataset_open(self, dataset_name, mode, fmt, fields): + """Open the dataset for writing. .. versionadded:: (cfdm) 1.7.0 :Parameters: - filename: `str` - As for the *filename* parameter for initialising a - `netCDF.Dataset` instance. + dataset_name: `str` + The dataset to open. 
mode: `str` As for the *mode* parameter for initialising a - `netCDF.Dataset` instance. + `netCDF4.Dataset` instance. fmt: `str` As for the *format* parameter for initialising a - `netCDF.Dataset` instance. + `netCDF4.Dataset` instance. Ignored for Zarr datasets. fields: sequence of `Field` or `Domain` - The constructs to be written to the netCDF file. Note - that these constructs are only used to ascertain if - any data to be written is in *filename*. If this is + The constructs to be written to the dataset. Note that + these constructs are only used to ascertain if any + data to be written is in *dataset_name*. If this is the case and mode is "w" then an exception is raised - to prevent *filename* from being deleted. + to prevent *dataset_name* from being deleted. :Returns: - `netCDF.Dataset` - A `netCDF4.Dataset` object for the file. + `netCDF.Dataset` or `zarr.Group` """ import netCDF4 if fields and mode == "w": - filename = os.path.abspath(filename) + dataset_name = os.path.abspath(dataset_name) for f in fields: - if filename in self.implementation.get_original_filenames(f): + if dataset_name in self.implementation.get_original_filenames( + f + ): raise ValueError( - "Can't write with mode 'w' to a file that contains " - f"data that needs to be read: {f!r} uses {filename}" + "Can't write with mode 'w' to a dataset that contains " + f"data which needs to be read: {f!r} uses " + f"{dataset_name}" ) - # mode == 'w' is safer than != 'a' in case of a typo (the letters - # are neighbours on a QWERTY keyboard) since 'w' is destructive. - # Note that for append ('a') mode the original file is never wiped. - if mode == "w" and self.write_vars["overwrite"]: - os.remove(filename) + g = self.write_vars - try: - nc = netCDF4.Dataset(filename, mode, format=fmt) - except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + # mode == 'w' is safer than != 'a' in case of a typo (the + # letters are neighbours on a QWERTY keyboard) since 'w' is + # destructive. Note that for append ('a') mode the original + # dataset is never wiped. + if mode == "w" and g["overwrite"]: + self.dataset_remove() + + match g["backend"]: + case "netCDF4": + try: + nc = netCDF4.Dataset(dataset_name, mode, format=fmt) + except RuntimeError as error: + raise RuntimeError(f"{error}: {dataset_name}") + + case "zarr": + try: + import zarr + except ModuleNotFoundError as error: + error.msg += ( + ". Install the 'zarr' package " + "(https://pypi.org/project/zarr) to write Zarr " + "datasets" + ) + raise + + nc = zarr.create_group( + dataset_name, + overwrite=g["overwrite"], + zarr_format=3, + storage_options=g.get("storage_options"), + ) return nc @@ -4692,7 +5051,7 @@ def file_open(self, filename, mode, fmt, fields): def write( self, fields, - filename, + dataset_name, fmt="NETCDF4", mode="w", overwrite=True, @@ -4717,53 +5076,51 @@ def write( coordinates=False, omit_data=None, dataset_chunks="4MiB", + dataset_shards=None, cfa="auto", reference_datetime=None, ): - """Write field and domain constructs to a netCDF file. + """Write field and domain constructs to a dataset. - NetCDF dimension and variable names will be taken from - variables' `ncvar` attributes and the field attribute - `!ncdimensions` if present, otherwise they are inferred from - standard names or set to defaults. NetCDF names may be - automatically given a numerical suffix to avoid duplication. 
+ Output global properties are those which occur in the set of + CF global properties and non-standard data variable properties + and which have equal values across all input fields. - Output netCDF file global properties are those which occur in the set - of CF global properties and non-standard data variable properties and - which have equal values across all input fields. - - Logically identical field components are only written to the file - once, apart from when they need to fulfil both dimension coordinate - and auxiliary coordinate roles for different data variables. + Logically identical field components are only written to the + datset once, apart from when they need to fulfil both + dimension coordinate and auxiliary coordinate roles for + different data variables. .. versionadded:: (cfdm) 1.7.0 :Parameters: fields : (sequence of) `cfdm.Field` - The field or fields to write to the file. + The field or fields to write to the dataset. See `cfdm.write` for details. - filename: str - The output CF-netCDF file. + dataset_name: str + The output dataset. See `cfdm.write` for details. mode: `str`, optional - Specify the mode of write access for the output file. One of: + Specify the mode of write access for the output + dataset. One of: ======== ================================================= *mode* Description ======== ================================================= - ``'w'`` Open a new file for writing to. If it exists and - *overwrite* is True then the file is deleted - prior to being recreated. - ``'a'`` Open an existing file for appending new + ``'w'`` Open a new dataset for writing to. If it + exists and *overwrite* is True then the + dataset is deleted prior to being recreated. + + ``'a'`` Open an existing dataset for appending new information to. The new information will be incorporated whilst the original contents of the - file will be preserved. + dataset will be preserved. In practice this means that new fields will be created, whilst the original fields will not be @@ -4772,7 +5129,7 @@ def write( For append mode, note the following: - * Global attributes on the file + * Global attributes on the dataset will remain the same as they were originally, so will become inaccurate where appended fields have incompatible attributes. To rectify this, @@ -4782,7 +5139,7 @@ def write( `nc_set_global_attribute`. * Fields with incompatible ``featureType`` to - the original file cannot be appended. + the original dataset cannot be appended. * At present fields with groups cannot be appended, but this will be possible in a future @@ -4802,13 +5159,13 @@ def write( ======== ================================================= - By default the file is opened with write access mode - ``'w'``. + By default the dataset is opened with write access + mode ``'w'``. overwrite: bool, optional - If False then raise an exception if the output file - pre-exists. By default a pre-existing output file is - over written. + If False then raise an exception if the output dataset + pre-exists. By default a pre-existing output dataset + is over written. See `cfdm.write` for details. @@ -4816,8 +5173,9 @@ def write( See `cfdm.write` for details. file_descriptors: `dict`, optional - Create description of file contents netCDF global - attributes from the specified attributes and their values. + Create description of dataset contents netCDF global + attributes from the specified attributes and their + values. See `cfdm.write` for details. @@ -4835,9 +5193,9 @@ def write( See `cfdm.write` for details. 
external: `str`, optional - Write metadata constructs that have data and are marked as - external to the named external file. Ignored if there are - no such constructs. + Write metadata constructs that have data and are + marked as external to the named external + dataset. Ignored if there are no such constructs. See `cfdm.write` for details. @@ -4854,7 +5212,8 @@ def write( See `cfdm.write` for details. endian: `str`, optional - The endian-ness of the output file. + The endian-ness of the output dataset. Ignored for + Zarr datasets. See `cfdm.write` for details. @@ -4864,14 +5223,15 @@ def write( See `cfdm.write` for details. least_significant_digit: `int`, optional - Truncate the input field construct data arrays, but not - the data arrays of metadata constructs. + Truncate the input field construct data arrays, but + not the data arrays of metadata constructs. Ignored + for Zarr datasets. See `cfdm.write` for details. chunk_cache: `int` or `None`, optional - The amount of memory (in bytes) used in each - variable's chunk cache at the HDF5 level. + The amount of memory (in bytes) used in each HDF5 + variable's chunk cache. Ignored for Zarr datasets. See `cfdm.write` for details. @@ -4880,29 +5240,31 @@ def write( fletcher32: `bool`, optional If True then the Fletcher-32 HDF5 checksum algorithm is activated to detect compression errors. Ignored if - *compress* is ``0``. + *compress* is ``0``. Ignored for Zarr datasets. See `cfdm.write` for details. shuffle: `bool`, optional - If True (the default) then the HDF5 shuffle filter (which - de-interlaces a block of data before compression by - reordering the bytes by storing the first byte of all of a - variable's values in the chunk contiguously, followed by - all the second bytes, and so on) is turned off. + If True (the default) then the HDF5 shuffle filter + (which de-interlaces a block of data before + compression by reordering the bytes by storing the + first byte of all of a variable's values in the chunk + contiguously, followed by all the second bytes, and so + on) is turned off. Ignored for Zarr datasets. See `cfdm.write` for details. string: `bool`, optional By default string-valued construct data are written as - netCDF arrays of type string if the output file format is - ``'NETCDF4'``, or of type char with an extra dimension - denoting the maximum string length for any other output - file format (see the *fmt* parameter). If *string* is False - then string-valued construct data are written as netCDF - arrays of type char with an extra dimension denoting the - maximum string length, regardless of the selected output - file format. + netCDF arrays of type string if the output dataset + format is ``'NETCDF4'`` or ``'ZARR3'``, or of type + char with an extra dimension denoting the maximum + string length for any other output dataset format (see + the *fmt* parameter). If *string* is False then + string-valued construct data are written as netCDF + arrays of type char with an extra dimension denoting + the maximum string length, regardless of the selected + output dataset format. See `cfdm.write` for details. @@ -4918,7 +5280,7 @@ def write( The consequence of writing out-of-range data values is that, by default, these values will be masked when the - file is subsequently read. + dataset is subsequently read. *Parameter example:* If a construct has ``valid_max`` property with value @@ -4929,9 +5291,9 @@ def write( .. versionadded:: (cfdm) 1.8.3 group: `bool`, optional - If False then create a "flat" netCDF file, i.e. 
one with - only the root group, regardless of any group structure - specified by the field constructs. + If False then create a "flat" netCDF dataset, i.e. one + with only the root group, regardless of any group + structure specified by the field constructs. See `cfdm.write` for details. @@ -4957,6 +5319,12 @@ def write( The dataset chunking strategy. The default value is "4MiB". See `cfdm.write` for details. + dataset_shards: `int` or `None`, optional + The Zarr dataset sharding strategy. The default value + is `None`. See `cfdm.write` for details. + + .. versionadded:: (cfdm) NEXTVERSION + cfa: `dict` or `None`, optional Configure the creation of aggregation variables. See `cfdm.write` for details. @@ -4976,9 +5344,9 @@ def write( logger.info(f"Writing to {fmt}") # pragma: no cover - # Expand file name - filename = os.path.expanduser(os.path.expandvars(filename)) - filename = abspath(filename) + # Expand dataset name + dataset_name = os.path.expanduser(os.path.expandvars(dataset_name)) + dataset_name = abspath(dataset_name) # Parse the 'omit_data' parameter if omit_data is None: @@ -5000,11 +5368,15 @@ def write( # Initialise netCDF write parameters # ------------------------------------------------------------ self.write_vars = { - "filename": filename, - # Format of output file + "dataset_name": dataset_name, + # Format of output dataset "fmt": None, + # Backend for writing to the dataset + "backend": None, + # Whether the output datset is a file or a directory + "dataset_type": None, # netCDF4.Dataset instance - "netcdf": None, + # "netcdf": None, # Map netCDF variable names to netCDF4.Variable instances "nc": {}, # Map netCDF dimension names to netCDF dimension sizes @@ -5038,8 +5410,8 @@ def write( ), # Data type conversions to be applied prior to writing "datatype": {}, - # Whether or not to write string data-types to netCDF4 - # files (as opposed to car data-types). + # Whether or not to write string data-types to the output + # dataset (as opposed to char data-types). "string": string, # Conventions "Conventions": Conventions, @@ -5067,7 +5439,8 @@ def write( # dimensions keyed by items of the field (such as a # coordinate or a coordinate reference) "seen": {}, - # Dry run: populate 'seen' dict without actually writing to file. + # Dry run: populate 'seen' dict without actually writing + # to dataset. "dry_run": False, # To indicate if the previous iteration was a dry run: # @@ -5086,18 +5459,19 @@ def write( # -------------------------------------------------------- # Configuration options for writing aggregation variables "cfa": None, - # The directory of the aggregation file + # The directory of the aggregation dataset "aggregation_file_directory": None, # Cache the CF aggregation variable write status for each - # netCDF variable + # dataset variable "cfa_write_status": {}, # -------------------------------------------------------- - # Dataset chunking stategy + # Dataset chunking and sharding stategy # -------------------------------------------------------- "dataset_chunks": dataset_chunks, + "dataset_shards": dataset_shards, # -------------------------------------------------------- # Quantization: Store unique Quantization objects, keyed - # by their output netCDF variable names. + # by their output dataset variable names. # -------------------------------------------------------- "quantization": {}, } @@ -5124,6 +5498,14 @@ def write( f"{dataset_chunks!r}." 
) + # Parse the 'dataset_shards' parameter + if dataset_shards is not None: + if not isinstance(dataset_shards, Integral) or dataset_shards < 1: + raise ValueError( + f"Invalid value for 'dataset_shards' keyword: " + f"{dataset_shards!r}." + ) + # ------------------------------------------------------------ # Parse the 'cfa' keyword # ------------------------------------------------------------ @@ -5173,14 +5555,20 @@ def write( effective_fields = fields if mode == "a": - # First read in the fields from the existing file: + if fmt == "ZARR3": + raise ValueError( + "Can't write with mode 'a' to a Zarr dataset" + ) + + # First read in the fields from the existing dataset: effective_fields = self._NetCDFRead(self.implementation).read( - filename, netcdf_backend="netCDF4" + dataset_name, netcdf_backend="netCDF4" ) # Read rather than append for the first iteration to ensure nothing # gets written; only want to update the 'seen' dictionary first. effective_mode = "r" + overwrite = False self.write_vars["dry_run"] = True @@ -5233,14 +5621,14 @@ def write( ): raise ValueError( "Can't append fields with an incompatible 'featureType' " - "global attribute to the original file." + "global attribute to the original dataset." ) self._file_io_iteration( mode=effective_mode, overwrite=overwrite, fields=effective_fields, - filename=filename, + dataset_name=dataset_name, fmt=fmt, global_attributes=global_attributes, variable_attributes=variable_attributes, @@ -5273,7 +5661,7 @@ def write( mode=mode, overwrite=overwrite, fields=fields, - filename=filename, + dataset_name=dataset_name, fmt=fmt, global_attributes=global_attributes, variable_attributes=variable_attributes, @@ -5299,7 +5687,7 @@ def _file_io_iteration( mode, overwrite, fields, - filename, + dataset_name, fmt, global_attributes, variable_attributes, @@ -5319,11 +5707,11 @@ def _file_io_iteration( warn_valid, group, ): - """Perform a file-writing iteration with the given settings.""" + """Perform a dataset-writing iteration.""" from packaging.version import Version # ------------------------------------------------------------ - # Initiate file IO with given write variables + # Initiate dataset IO with given write variables # ------------------------------------------------------------ if mode == "w": desc = "Writing to" @@ -5361,10 +5749,10 @@ def _file_io_iteration( if group: # Can't write groups to a netCDF-3 file g["group"] = False - elif fmt not in NETCDF4_FMTS: + elif fmt not in NETCDF4_FMTS + ZARR_FMTS: raise ValueError( - f"Unknown output file format: {fmt!r}. " - f"Valid formats are {NETCDF4_FMTS + NETCDF3_FMTS}" + f"Unknown output dataset format: {fmt!r}. 
" + f"Valid formats are {NETCDF4_FMTS + NETCDF3_FMTS + ZARR_FMTS}" ) # ------------------------------------------------------------ @@ -5381,7 +5769,7 @@ def _file_io_iteration( if "Conventions" in variable_attributes: raise ValueError( "Can't prevent the 'Conventions' property from being " - f"a netCDF global variable: {variable_attributes}" + f"a CF global variable: {variable_attributes}" ) if global_attributes: @@ -5430,6 +5818,13 @@ def _file_io_iteration( g["least_significant_digit"] = least_significant_digit g["fmt"] = fmt + match fmt: + case "ZARR3": + g["backend"] = "zarr" + g["dataset_type"] = "directory" + case _: + g["backend"] = "netCDF4" + g["dataset_type"] = "file" if isinstance( fields, @@ -5456,29 +5851,29 @@ def _file_io_iteration( g["overwrite"] = overwrite # ------------------------------------------------------------ - # Open the output netCDF file + # Open the output dataset # ------------------------------------------------------------ - if os.path.isfile(filename): + if self.dataset_exists(dataset_name): if mode == "w" and not overwrite: raise IOError( - "Can't write with mode {mode!r} to existing file " - f"{os.path.abspath(filename)} unless overwrite=True" + f"Can't write with mode {mode!r} to existing dataset " + f"{os.path.abspath(dataset_name)} unless overwrite=True" ) - if not os.access(filename, os.W_OK): + if not os.access(dataset_name, os.W_OK): raise IOError( - "Can't write to existing file " - f"{os.path.abspath(filename)} without permission" + "Can't write to existing dataset " + f"{os.path.abspath(dataset_name)} without permission" ) else: g["overwrite"] = False - g["filename"] = filename - g["netcdf"] = self.file_open(filename, mode, fmt, fields) + g["dataset_name"] = dataset_name + g["dataset"] = self.dataset_open(dataset_name, mode, fmt, fields) if not g["dry_run"]: # -------------------------------------------------------- - # Write global properties to the file first. This is + # Write global properties to the dataset first. This is # important as doing it later could slow things down # enormously. This function also creates the # g['global_attributes'] set, which is used in the @@ -5487,7 +5882,7 @@ def _file_io_iteration( self._write_global_attributes(fields) # -------------------------------------------------------- - # Write group-level properties to the file next + # Write group-level properties to the dataset next # -------------------------------------------------------- if ( g["group"] and not g["post_dry_run"] @@ -5504,12 +5899,12 @@ def _file_io_iteration( ) external = os.path.expanduser(os.path.expandvars(external)) - if os.path.realpath(external) == os.path.realpath(filename): + if os.path.realpath(external) == os.path.realpath(dataset_name): raise ValueError( - "Can't set filename and external to the " "same path" + "Can't set 'dataset_name' and 'external' to the same path" ) - g["external_file"] = external + g["external_dataset"] = external # ------------------------------------------------------------ # Write each field construct @@ -5520,18 +5915,18 @@ def _file_io_iteration( # ------------------------------------------------------------ # Write all of the buffered data to disk # ------------------------------------------------------------ - # For append mode, it is cleaner code-wise to close the file - # on the read iteration and re-open it for the append + # For append mode, it is cleaner code-wise to close the + # dataset on the read iteration and re-open it for the append # iteration. So we always close it here. 
- self.file_close(filename) + self.dataset_close() # ------------------------------------------------------------ - # Write external fields to the external file + # Write external fields to the external dataset # ------------------------------------------------------------ - if g["external_fields"] and g["external_file"] is not None: + if g["external_fields"] and g["external_dataset"] is not None: self.write( fields=g["external_fields"], - filename=g["external_file"], + dataset_name=g["external_dataset"], fmt=fmt, overwrite=overwrite, datatype=datatype, @@ -5540,6 +5935,9 @@ def _file_io_iteration( fletcher32=fletcher32, shuffle=shuffle, extra_write_vars=extra_write_vars, + chunk_cache=chunk_cache, + dataset_chunks=g["dataset_chunks"], + dataset_shards=g["dataset_shards"], ) def _int32(self, array): @@ -5578,7 +5976,7 @@ def _int32(self, array): return array def _dimension_in_subgroup(self, v, ncdim): - """Return True if the netCDF dimension is in a valid group. + """Return True if the dimension is in a valid group. Returns True if the dimension is in the same group, or a parent group, as the group defined by the construct. Otherwise @@ -5591,7 +5989,7 @@ def _dimension_in_subgroup(self, v, ncdim): v: Construct ncdim: `str` - The netCDF dimension name. + The dataset dimension name. *Parameter example:* ``'lat'`` @@ -5602,8 +6000,7 @@ def _dimension_in_subgroup(self, v, ncdim): :Returns: `bool` - Whether or not the netCDF dimension is in a valid - group. + Whether or not the dimension is in a valid group. """ v_groups = self.implementation.nc_get_variable_groups(v) @@ -5625,7 +6022,7 @@ def _customise_write_vars(self): pass def _chunking_parameters(self, data, ncdimensions): - """Set chunking parameters for `netCDF4.createVariable`. + """Set chunking parameters for a dataset variable. .. versionadded:: (cfdm) 1.11.2.0 @@ -5635,17 +6032,17 @@ def _chunking_parameters(self, data, ncdimensions): The data being written. ncdimensions: `tuple` - The data netCDF dimensions. + The dataset dimensions of the data. :Returns: - 2-tuple - The *contiguous* and *chunksizes* parameters for - `netCDF4.createVariable`. + 3-tuple + The 'contiguous', 'chunksizes', and 'shards' + parameters for `_createVariable`. """ if data is None: - return False, None + return False, None, None g = self.write_vars @@ -5655,31 +6052,36 @@ def _chunking_parameters(self, data, ncdimensions): # ------------------------------------------------------------ # Get the chunking strategy defined by the data itself chunksizes = self.implementation.nc_get_dataset_chunksizes(data) + shards = self.implementation.nc_get_dataset_shards(data) + if chunksizes == "contiguous": # Contiguous as defined by 'data' - return True, None + return True, None, None # Still here? + if shards is None: + shards = g["dataset_shards"] + dataset_chunks = g["dataset_chunks"] if isinstance(chunksizes, int): # Reset dataset chunks to the integer given by 'data' dataset_chunks = chunksizes elif chunksizes is not None: # Chunked as defined by the tuple of int given by 'data' - return False, chunksizes + return False, chunksizes, shards # Still here? Then work out the chunking strategy from the # dataset_chunks if dataset_chunks == "contiguous": # Contiguous as defined by 'dataset_chunks' - return True, None + return True, None, None # Still here? Then work out the chunks from both the # size-in-bytes given by dataset_chunks (e.g. 1024, or '1 # KiB'), and the data shape (e.g. (12, 73, 96)). 
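+ # For illustration (the helper actually used may differ):
+ # dask's chunk normalisation can turn a byte limit and a
+ # shape into per-dimension chunk sizes, e.g.
+ #
+ #   from dask.array.core import normalize_chunks
+ #   normalize_chunks(
+ #       "auto", shape=(12, 73, 96), dtype="f8", limit="4MiB"
+ #   )
+ #
+ # returns one tuple of sizes per dimension, from which a
+ # single size per dimension is chosen below.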
if self._compressed_data(ncdimensions): # Base the dataset chunks on the compressed data that is - # going into the file + # going into the dataset d = self.implementation.get_compressed_array(data) else: d = data @@ -5698,11 +6100,11 @@ def _chunking_parameters(self, data, ncdimensions): # (250, 250, 4)). However, we only want one number per # dimension, so we choose the largest: [96, 250]. chunksizes = [max(c) for c in chunksizes] - return False, chunksizes + return False, chunksizes, shards else: # The data is scalar, so 'chunksizes' is () => write the # data contiguously. - return True, None + return True, None, None def _compressed_data(self, ncdimensions): """Whether or not the data is being written in compressed form. @@ -5712,8 +6114,8 @@ def _compressed_data(self, ncdimensions): :Parameters: ncdimensions: `sequence` of `str` - The ordered netCDF dimension names of the data. These - are the dimensions going into the file, and if the + The ordered dataset dimension names of the data. These + are the dimensions going into the dataset, and if the data is compressed will differ from the dimensions implied by the data in memory. @@ -5845,7 +6247,7 @@ def _cfa_write_status(self, ncvar, cfvar, construct_type, domain_axes): return False def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): - """Write an aggregation variable to the netCDF file. + """Write an aggregation variable to the dataset. .. versionadded:: (cfdm) 1.12.0.0 @@ -5856,7 +6258,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): data. ncvar: `str` - The netCDF name for the variable. + The dataset name for the variable. ncdimensions: sequence of `str` @@ -5872,7 +6274,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): g = self.write_vars # ------------------------------------------------------------ - # Write the fragment array variables to the netCDF file + # Write the fragment array variables to the dataset # ------------------------------------------------------------ aggregated_data = data.nc_get_aggregated_data() aggregated_data_attr = [] @@ -5936,7 +6338,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): chunking = None - # Get the fragment array netCDF dimensions from the + # Get the fragment array dataset dimensions from the # 'location' fragment array variable. location_ncdimensions = [] for ncdim, size in zip(ncdimensions, f_uris.shape): @@ -5998,8 +6400,8 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): feature = "unique_values" f_unique_value = cfa[feature] - # Get the fragment array netCDF dimensions from the - # 'value' fragment array variable. + # Get the fragment array dimensions from the 'value' + # fragment array variable. 
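+ # These dimensions are constructed below by prefixing each
+ # data dimension name with 'a_', e.g. a data dimension
+ # 'time' gives the fragment array dimension 'a_time'.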
unique_value_ncdimensions = [] for ncdim, size in zip(ncdimensions, f_unique_value.shape): cfa_ncdim = f"a_{ncdim}" @@ -6011,7 +6413,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): unique_value_ncdimensions = tuple(unique_value_ncdimensions) - # Write the fragment array variable to the netCDF dataset + # Write the fragment array variable to the dataset feature_ncvar = self._cfa_write_fragment_array_variable( f_unique_value, aggregated_data.get(feature, f"fragment_{feature}"), @@ -6022,7 +6424,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): # ------------------------------------------------------------ # Add the aggregation variable attributes # ------------------------------------------------------------ - self._write_attributes( + self._write_variable_attributes( None, ncvar, extra={ @@ -6070,33 +6472,32 @@ def _cfa_write_fragment_array_variable( The data to write. ncvar: `str` - The netCDF variable name. + The dataset variable name. ncdimensions: `tuple` of `str` - The fragment array variable's netCDF dimensions. + The fragment array variable's dataset dimensions. attributes: `dict`, optional Any attributes to attach to the variable. chunking: sequence, optional - Set `netCDF4.createVariable` 'contiguous' and - `chunksizes` parameters (in that order) for the - fragment array variable. If not set (the default), - then these parameters are inferred from the data. + Set `_createVariable` 'contiguous', 'chunksizes', and + 'shards' parameters (in that order) for the fragment + array variable. If `None` (the default), then these + parameters are inferred from the data. :Returns: `str` - The netCDF variable name of the fragment array - variable. + The name of the fragment array dataset variable. """ create = not self._already_in_file(data, ncdimensions) if create: - # Create a new fragment array variable in the file, with - # 'contiguous' chunking - ncvar = self._netcdf_name(ncvar) + # Create a new fragment array variable in the dataset, + # with 'contiguous' chunking + ncvar = self._name(ncvar) self._write_netcdf_variable( ncvar, ncdimensions, @@ -6107,7 +6508,7 @@ def _cfa_write_fragment_array_variable( ) else: # This fragment array variable has already been written to - # the file + # the dataset ncvar = self.write_vars["seen"][id(data)]["ncvar"] return ncvar @@ -6184,10 +6585,11 @@ def _cfa_fragment_array_variables(self, data, cfvar): if not data.nc_get_aggregation_write_status(): raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF aggregation variable. " - "This is probably because some fragment values have been " - "changed relative to those in the fragment files, or a " - "rechunking has occured." + f"Can't write {cfvar!r} as a CF aggregation variable. " + "This could be " + "because some fragment values in memory have been " + "changed relative to those in the fragment datasets, " + "or a Dask rechunking has occured, etc." 
) # ------------------------------------------------------------ @@ -6221,11 +6623,11 @@ def _cfa_fragment_array_variables(self, data, cfvar): normalise = not uri_default if uri_relative: - # Get the aggregation file directory as an absolute + # Get the aggregation dataset directory as an absolute # URI aggregation_file_directory = g["aggregation_file_directory"] if aggregation_file_directory is None: - uri = urisplit(dirname(g["filename"])) + uri = urisplit(dirname(g["dataset_name"])) if uri.isuri(): aggregation_file_scheme = uri.scheme aggregation_file_directory = uri.geturi() @@ -6236,13 +6638,13 @@ def _cfa_fragment_array_variables(self, data, cfvar): authority="", path=uri.path, ) - fragment = uri.fragment - if fragment is not None: + uri_fragment = uri.fragment + if uri_fragment is not None: # Append a URI fragment. Do this with a # string-append, rather than via # `uricompose` in case the fragment # contains more than one # character. - aggregation_file_directory += f"#{fragment}" + aggregation_file_directory += f"#{uri_fragment}" g["aggregation_file_directory"] = ( aggregation_file_directory @@ -6257,10 +6659,10 @@ def _cfa_fragment_array_variables(self, data, cfvar): data.chunk_indices(), data.chunk_positions() ): # Try to get this Dask chunk's data as a reference to - # fragment file + # fragment dataset fragment = data[index].compute(_force_to_memory=False) try: - filename, address, is_subspace, f_index = ( + dataset_name, address, is_subspace, f_index = ( fragment.get_filename(normalise=normalise), fragment.get_address(), fragment.is_subspace(), @@ -6270,47 +6672,47 @@ def _cfa_fragment_array_variables(self, data, cfvar): # This Dask chunk's data is not a reference to # fragment file raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF " + f"Can't write {cfvar!r} as a CF " "aggregation variable: " f"The Dask chunk in position {position} " f"(defined by data index {index!r}) does not " - "reference a unique fragment file. This is could be " - "because some fragment values have been changed " - "relative to those in the fragment files, or a " - "Dask rechunking has occured, etc." + "reference a unique fragment dataset. This could " + "be because some fragment values in memory have been " + "changed relative to those in the fragment datasets, " + "or a Dask rechunking has occured, etc." ) if is_subspace: # This Dask chunk's data is a reference to - # fragment file, but only to a subspace of it. + # fragment dataset, but only to a subspace of it. raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF " + f"Can't write {cfvar!r} as a CF " "aggregation variable: " f"The Dask chunk in position {position} " f"(defined by data index {index!r}) references " - f"a subspace ({f_index!r}) of the fragment file " + f"a subspace ({f_index!r}) of the fragment dataset " f"{fragment!r}. This might be fixable by setting " - "the 'cfa_write' parameter to the 'read' function." + "the 'cfa_write' keyword in the 'read' function." ) - uri = urisplit(filename) + uri = urisplit(dataset_name) if uri_relative and uri.isrelpath(): - filename = abspath(filename) + dataset_name = abspath(dataset_name) if uri.isabspath(): - # File name is an absolute-path URI reference - filename = uricompose( + # Dataset name is an absolute-path URI reference + dataset_name = uricompose( scheme="file", authority="", path=uri.path, ) - fragment = uri.fragment - if fragment is not None: + uri_fragment = uri.fragment + if uri_fragment is not None: # Append a URI fragment. 
Do this with a # string-append, rather than via `uricompose` # in case the fragment contains more than one # # character. - filename += f"#{fragment}" + dataset_name += f"#{uri_fragment}" if uri_relative: scheme = uri.scheme @@ -6319,21 +6721,21 @@ def _cfa_fragment_array_variables(self, data, cfvar): if scheme != aggregation_file_scheme: raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF " + f"Can't write {cfvar!r} as a CF " "aggregation variable: " "Attempting to create a relative-path URI " - f"reference for the fragment file {fragment}, " + f"reference for the fragment dataset {fragment}, " "referenced by the Dask chunk in position " f"{position} (defined by data index {index!r}), " - "but the aggregation file URI scheme " + "but the aggregation dataset URI scheme " f"({aggregation_file_scheme}:) is incompatible." ) - filename = relpath( - filename, start=aggregation_file_directory + dataset_name = relpath( + dataset_name, start=aggregation_file_directory ) - aggregation_uris.append(filename) + aggregation_uris.append(dataset_name) aggregation_identifiers.append(address) # Reshape the 1-d aggregation instruction arrays to span @@ -6376,7 +6778,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): d.persist(inplace=True) except AggregationError as error: raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF aggregation " + f"Can't write {cfvar!r} as a CF aggregation " "variable. " "At least one Dask chunk has more than one unique value: " f"{error}. " @@ -6390,7 +6792,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): return out def _write_quantization_container(self, quantization): - """Write a CF-netCDF quantization container variable. + """Write a CF quantization container variable. .. note:: It is assumed, but not checked, that the per-variable parameters (such as "quantization_nsd" @@ -6407,7 +6809,7 @@ def _write_quantization_container(self, quantization): :Returns: `str` - The netCDF variable name for the quantization + The dataset variable name for the quantization container. """ @@ -6419,18 +6821,17 @@ def _write_quantization_container(self, quantization): return ncvar # Create a new quantization container variable - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( quantization, default="quantization" ) logger.info( - f" Writing {quantization!r} to netCDF variable: {ncvar}" + f" Writing {quantization!r} to variable: {ncvar}" ) # pragma: no cover kwargs = { "varname": ncvar, "datatype": "S1", - "dimensions": (), "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) @@ -6438,13 +6839,55 @@ def _write_quantization_container(self, quantization): if not g["dry_run"]: # Create the variable self._createVariable(**kwargs) - - # Set the attributes - g["nc"][ncvar].setncatts( - self.implementation.parameters(quantization) + self._set_attributes( + self.implementation.parameters(quantization), ncvar ) # Update the quantization dictionary g["quantization"][ncvar] = quantization return ncvar + + def _missing_value(self, x, datatype): + """Get the missing value. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + x: construct or `Data` + The data for which to get the missing value. + + datatype: `str` or str + The data type, e.g. ``'S1'``, ``'f4'``, `str`. Used + to get the netCDF default fill value, but only when a + missing value can't be found from the attributes of + *x*. + + :Returns: + + The missing value, or `None` if no missing value could + be found. 
+ + """ + try: + # Try 'x' as a construct + mv = x.get_property("_FillValue", None) + if mv is None: + mv = x.get_property("missing_value", None) + except AttributeError: + try: + # Try 'x' as a `Data` object + mv = getattr(x, "fill_value", None) + except AttributeError: + mv = None + + if mv is None: + # Try to get the netCDF default fill value + import netCDF4 + + mv = netCDF4.default_fillvals.get(datatype) + if mv is None and datatype is str: + mv = "" + + return mv diff --git a/cfdm/read_write/netcdf/zarr.py b/cfdm/read_write/netcdf/zarr.py index 564bd7d68..d6651898f 100644 --- a/cfdm/read_write/netcdf/zarr.py +++ b/cfdm/read_write/netcdf/zarr.py @@ -8,7 +8,7 @@ class ZarrDimension: """ - def __init__(self, name, size, group): + def __init__(self, name, size, group, reference_variable=None): """**Initialisation** :Parameters: @@ -22,10 +22,16 @@ def __init__(self, name, size, group): group: `zarr.Group` The group that the dimension is a member of. + reference_variable: `zarr.Array`, optional + The variable that provided the dimension definition. + + .. versionadded:: (cfdm) NEXTVERSION + """ - self.name = name - self.size = size - self.group = group + self._name = name + self._size = size + self._group = group + self._reference_variable = reference_variable def __len__(self): """The size of the dimension. @@ -37,6 +43,34 @@ def __len__(self): """ return self.size + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + .. versionadded:: (cfdm) 1.12.2.0 + + """ + return f"" + + @property + def name(self): + """Return the dimension name. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self._name + + @property + def size(self): + """Return the dimension size. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self._size + def group(self): """Return the group that the dimension is a member of. @@ -48,7 +82,7 @@ def group(self): The group containing the dimension. """ - return self.group + return self._group def isunlimited(self): """Whether or not the dimension is unlimited. @@ -64,3 +98,21 @@ def isunlimited(self): """ return False + + def reference_variable(self): + """Return the variable that provided the dimension definition. + + Note that the variable does not have to be in the dimension's + `group`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `zarr.Array` or `None` + The variable that provided the dimension definition, + or `None` if it wasn't provided during instance + initialisation. + + """ + return self._reference_variable diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 6899ada3e..4e1bf3395 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -17,7 +17,8 @@ class read(ReadWrite): """Read field or domain constructs from a dataset. - The following file formats are supported: netCDF, CDL, and Zarr. + The following dataset formats are supported: netCDF, CDL, and + Zarr. NetCDF and Zarr datasets may be on local disk, on an OPeNDAP server, or in an S3 object store. @@ -169,6 +170,10 @@ class read(ReadWrite): .. versionadded:: (cfdm) 1.11.2.0 + {{read store_dataset_shards: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + {{read cfa: `dict`, optional}} .. versionadded:: (cfdm) 1.12.0.0 @@ -189,6 +194,10 @@ class read(ReadWrite): .. versionadded:: (cfdm) 1.12.0.0 + {{read group_dimension_search: `str`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + ignore_unknown_type: Deprecated at version 1.12.2.0 Use *dataset_type* instead. 
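A minimal sketch of how the new read keywords documented above might be used together (the dataset path ``'example.zarr'`` is hypothetical; the ``store_dataset_shards`` and ``group_dimension_search`` keywords and the ``nc_dataset_chunksizes``/``nc_dataset_shards`` data methods are the ones introduced or referenced in this change):

.. code-block:: python

   import cfdm

   # Read a Zarr dataset ('example.zarr' is a hypothetical path). By
   # default the dataset chunk and shard layouts found on disk are
   # stored on the returned data.
   f = cfdm.read("example.zarr")[0]
   print(f.data.nc_dataset_chunksizes())
   print(f.data.nc_dataset_shards())

   # Discard the on-disk shard layout at read time, and make the
   # dimension-to-group search strategy for hierarchical datasets
   # explicit ('closest_ancestor' is the default).
   f = cfdm.read(
       "example.zarr",
       store_dataset_shards=False,
       group_dimension_search="closest_ancestor",
   )[0]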
@@ -237,6 +246,7 @@ def __new__( cache=True, dask_chunks="storage-aligned", store_dataset_chunks=True, + store_dataset_shards=True, cfa=None, cfa_write=None, to_memory=False, @@ -247,6 +257,7 @@ def __new__( followlinks=False, cdl_string=False, extra_read_vars=None, + group_dimension_search="closest_ancestor", **kwargs, ): """Read field or domain constructs from datasets. @@ -370,7 +381,10 @@ def _datasets(self): if isdir(x): if is_zarr(x): # This directory is a Zarr dataset, so don't - # look in any subdirectories. + # look in any subdirectories, which contain + # the dataset chunks (but note - it is allowed + # for non-chunk subdirectories to exist, but + # if they do we're going to ignore them!). n_datasets += 1 yield x continue @@ -565,6 +579,7 @@ def _read(self, dataset): "cache", "dask_chunks", "store_dataset_chunks", + "store_dataset_shards", "cfa", "cfa_write", "to_memory", @@ -573,6 +588,7 @@ def _read(self, dataset): "dataset_type", "cdl_string", "extra_read_vars", + "group_dimension_search", ) } diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 85f2df22d..d9ad7c88b 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -6,21 +6,23 @@ class write(ReadWrite): - """Write field and domain constructs to a netCDF file. + """Write field and domain constructs to a dataset. - **File format** + The following dataset formats are supported: netCDF and Zarr. - See the *fmt* parameter for details on which output netCDF file + **Dataset format** + + See the *fmt* parameter for details on which output dataset formats are supported. - **NetCDF variable and dimension names** + **Dataset variable and dimension names** - These names are stored within constructs read a from dataset, or - may be set manually. They are used when writing a field construct - to the file. If a name has not been set then one will be - constructed (usually based on the standard name if it exists). The - names may be modified internally to prevent duplication in the - file. + These names are stored within constructs and are either read + from another dataset or may be set manually. They are used when + writing a field construct to the dataset. If a name has not been + set then one will be constructed (usually based on the standard + name if it exists). The names may be modified internally to + prevent duplication in the dataset. Each construct, or construct component, that corresponds to a netCDF variable has the following methods to get, set and remove a @@ -33,19 +35,19 @@ class write(ReadWrite): `~{{package}}.DomainAxis.nc_set_dimension` and `~{{package}}.DomainAxis.nc_del_dimension`. - **NetCDF attributes** + **Dataset attributes** - Field construct properties may be written as netCDF global - attributes and/or netCDF data variable attributes. See the - *file_descriptors*, *global_attributes* and *variable_attributes* - parameters for details. + Field construct properties may be written as global attributes + (i.e. attributes of the root group) and/or data variable + attributes. See the *file_descriptors*, *global_attributes* and + *variable_attributes* parameters for details. **External variables** - Metadata constructs marked as external are omitted from the file - and referred to via the netCDF ``external_variables`` global - attribute. However, omitted constructs may be written to an - external file (see the *external* parameter for details). 
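A rough sketch of the external-variable workflow described here, assuming the cell measure construct's ``nc_set_external`` marker method (not part of this change) and using illustrative output names:

.. code-block:: python

   import cfdm

   f = cfdm.example_field(1)

   # Mark the area cell measure as external (nc_set_external is
   # assumed here to be available on the cell measure construct).
   f.construct("measure:area").nc_set_external(True)

   # The marked construct is omitted from 'parent.nc' and recorded in
   # its 'external_variables' attribute; its data are written to the
   # separate dataset named by the 'external' keyword. Both names are
   # illustrative.
   cfdm.write(f, "parent.nc", external="external.nc")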
+ Metadata constructs marked as external are omitted from the + dataset and referred to via the CF ``external_variables`` global + attribute. However, the omitted constructs may be written to an + external dataset (see the *external* parameter for details). **NetCDF unlimited dimensions** @@ -81,14 +83,12 @@ class write(ReadWrite): :Parameters: fields: (sequence of) `Field` or `Domain` - The field and domain constructs to write to the file. - - filename: `str` - The output netCDF file name. Various type of expansion are - applied to the file names. + The field and domain constructs to write to the dataset. - Relative paths are allowed, and standard tilde and shell - parameter expansions are applied to the string. + dataset_name: `str` + The output dataset name. Relative paths are allowed, and + standard tilde and shell parameter expansions are applied + to the string. *Parameter example:* The file ``file.nc`` in the user's home directory could @@ -97,10 +97,10 @@ class write(ReadWrite): ``'~/file.nc'``, ``'~/tmp/../file.nc'``. fmt: `str`, optional - The format of the output file. One of: + The format of the output dataset. One of: ========================== ============================== - *fmt* Output file type + *fmt* Output dataset type ========================== ============================== ``'NETCDF4'`` NetCDF4 format file. This is the default. @@ -121,12 +121,14 @@ class write(ReadWrite): ``'NETCDF3_64BIT_DATA'`` NetCDF3 64-bit offset format file with extensions (see below) + + ``'ZARR3'`` Zarr v3 dataset ========================== ============================== By default the format is ``'NETCDF4'``. - All formats support large files (i.e. those greater than - 2GB) except ``'NETCDF3_CLASSIC'``. + All NETCDF formats support large files (i.e. those greater + than 2GB), except ``'NETCDF3_CLASSIC'``. ``'NETCDF3_64BIT_DATA'`` is a format that requires version 4.4.0 or newer of the C library (use @@ -146,19 +148,20 @@ class write(ReadWrite): and use the new features of the version 4 API. mode: `str`, optional - Specify the mode of write access for the output file. One of: + Specify the mode of write access for the output + datset. One of: ======== ================================================= *mode* Description ======== ================================================= - ``'w'`` Open a new file for writing to. If it exists and - *overwrite* is True then the file is deleted - prior to being recreated. + ``'w'`` Open a new dataset for writing to. If it exists + and *overwrite* is True then the dataset is + deleted prior to being recreated. - ``'a'`` Open an existing file for appending new + ``'a'`` Open an existing dataset for appending new information to. The new information will be incorporated whilst the original contents of the - file will be preserved. + dataset will be preserved. In practice this means that new fields will be created, whilst the original fields will not be @@ -167,7 +170,7 @@ class write(ReadWrite): For append mode, note the following: - * Global attributes on the file + * Global attributes on the dataset will remain the same as they were originally, so will become inaccurate where appended fields have incompatible attributes. To rectify this, @@ -177,7 +180,7 @@ class write(ReadWrite): `nc_set_global_attribute`. * Fields with incompatible ``featureType`` to - the original file cannot be appended. + the original dataset cannot be appended. 
* At present fields with groups cannot be appended, but this will be possible in a future @@ -193,16 +196,18 @@ class write(ReadWrite): resultant fields using `set_domain_ancillary` and similar methods if required. + .. note: Zarr datasets can not be appended to. + ``'r+'`` Alias for ``'a'``. ======== ================================================= - By default the file is opened with write access mode + By default the dataset is opened with write access mode ``'w'``. overwrite: `bool`, optional - If False then raise an error if the output file - pre-exists. By default a pre-existing output file is + If False then raise an error if the output dataset + pre-exists. By default a pre-existing output dataset is overwritten. Conventions: (sequence of) `str`, optional @@ -261,8 +266,8 @@ class write(ReadWrite): construct properties, which are created as netCDF global attributes by default: - * the description of file contents properties (as defined - by the CF conventions), and + * the description of dataset contents properties (as + defined by the CF conventions), and * properties flagged as global on any of the field constructs being written (see @@ -278,7 +283,7 @@ class write(ReadWrite): data variable corresponding to each field construct that contains the property. - Any global attributes that are also specified as file + Any global attributes that are also specified as dataset descriptors will not be written as netCDF global variables, but as netCDF data variable attributes instead. @@ -316,8 +321,8 @@ class write(ReadWrite): external: `str`, optional Write metadata constructs that have data and are marked as - external to the named external file. Ignored if there are - no such constructs. + external to the named external dataset. Ignored if there + are no such constructs. datatype: `dict`, optional Specify data type conversions to be applied prior to @@ -339,11 +344,11 @@ class write(ReadWrite): numpy.dtype('int32')}``. endian: `str`, optional - The endian-ness of the output file. Valid values are + The endian-ness of the output dataset. Valid values are ``'little'``, ``'big'`` or ``'native'``. By default the output is native endian. See the `netCDF4 package `_ for more - details. + details. Ignored for Zarr datsets. *Parameter example:* ``endian='big'`` @@ -374,8 +379,8 @@ class write(ReadWrite): ``least_significant_digit=3`` chunk_cache: `int` or `None`, optional - The amount of memory (in bytes) used in each variable's - chunk cache at the HDF5 level. + The amount of memory (in bytes) used in each HDF5 + variable's chunk cache. Ignored when not writing to a netCDF-4 format. By default, or if `None`, the default netCDF-C chunk cache size of @@ -420,14 +425,14 @@ class write(ReadWrite): string: `bool`, optional By default string-valued construct data are written as - netCDF arrays of type string if the output file format is - ``'NETCDF4'``, or of type char with an extra dimension - denoting the maximum string length for any other output - file format (see the *fmt* parameter). If *string* is False - then string-valued construct data are written as netCDF - arrays of type char with an extra dimension denoting the - maximum string length, regardless of the selected output - file format. + netCDF arrays of type string if the output dataset format + is ``'NETCDF4'`` or ``'ZARR3'``, or of type char with an + extra dimension denoting the maximum string length for any + other output dataset format (see the *fmt* parameter). 
If + *string* is False then string-valued construct data are + written as netCDF arrays of type char with an extra + dimension denoting the maximum string length, regardless + of the selected output dataset format. .. versionadded:: (cfdm) 1.8.0 @@ -469,7 +474,7 @@ class write(ReadWrite): The consequence of writing out-of-range data values is that, by default, these values will be masked when the - file is subsequently read. + dataset is subsequently read. *Parameter example:* If a construct has ``valid_max`` property with value @@ -480,11 +485,11 @@ class write(ReadWrite): .. versionadded:: (cfdm) 1.8.3 group: `bool`, optional - If False then create a "flat" netCDF file, i.e. one with - only the root group, regardless of any group structure + If False then create a "flat" dataset, i.e. one with only + the root group, regardless of any group structure specified by the field constructs. By default any groups - defined by the netCDF interface of the field constructs and - its components will be created and populated. + defined by the netCDF interface of the field constructs + and its components will be created and populated. .. versionadded:: (cfdm) 1.8.6 @@ -500,11 +505,11 @@ class write(ReadWrite): Do not write the data of the named construct types. This does not affect the amount of netCDF variables and - dimensions that are written to the file, nor the netCDF + dimensions that are written to the dataset, nor the netCDF variables' attributes, but does not create data on disk - for the requested variables. The resulting file will be + for the requested variables. The resulting dataset will be smaller than it otherwise would have been, and when the - new file is read the data of these variables will be + new dataset is read the data of these variables will be represented by an array of all missing data. The *omit_data* parameter may be one, or a sequence, of: @@ -535,8 +540,8 @@ class write(ReadWrite): .. versionadded:: (cfdm) 1.10.0.1 dataset_chunks: `str` or `int` or `float`, optional - The dataset chunking strategy for data arrays being written - to the file. + The dataset chunking strategy for data arrays being + written to the dataset. By default, *dataset_chunks* is ``'4 MiB'``, i.e. 4194304 bytes. @@ -552,21 +557,24 @@ class write(ReadWrite): .. note:: By default, a data array returned by `{{package}}.read` stores its dataset chunking - strategy from the file being read. When this + strategy from the dataset being read. When this happens that same dataset chunking strategy will be used when the data is written to a new - netCDF4 file, unless the strategy was modified - or removed prior to writing. To prevent the - dataset chunking strategy from the original file - being stored, see the *store_dataset_chunks* - parameter to `{{package}}.read`. + netCDF4 or Zarr dataset, unless the strategy was + modified or removed prior to writing. To prevent + the dataset chunking strategy from the original + dataset being stored, see the + *store_dataset_chunks* parameter to + `{{package}}.read`. The *dataset_chunks* parameter may be one of: * ``'contiguous'`` - The data will written to the file contiguously, i.e. no - chunking. + The data will be written to the dataset contiguously, + i.e. no chunking. For a Zarr dataset, this is + implemented as a single dataset chunk for the entire + array. 
* `int` or `float` or `str` @@ -587,7 +595,7 @@ class write(ReadWrite): number of bytes (taken either from the *dataset_chunks* parameter, or as stored by the data itself), "square-like" dataset chunk shapes - are preferred that maximise the amount of chunks + are preferred that maximise the number of chunks that are completely filled with data values. For example, with *dataset_chunks* of ``'4 MiB'``, a data array of 64-bit floats with shape (400, @@ -611,6 +619,44 @@ class write(ReadWrite): .. versionadded:: (cfdm) 1.12.0.0 + dataset_shards: `None` or `int`, optional + When writing to a Zarr dataset, sharding provides a + mechanism to store multiple dataset chunks in a single + storage object or file. Without sharding, each dataset + chunk is written to its own file. Traditional file systems + and object storage systems may have performance issues + storing and accessing large number of files, and small + files can be inefficient to store if they are smaller than + the block size of the file system. Sharding can improve + performance by creating fewer, and larger, files for + storing the dataset chunks. + + The *dataset_shards* parameter is ignored when writing to + a non-Zarr datset. + + If any `Data` being written already stores its own dataset + sharding strategy (i.e. its `Data.nc_dataset_shards` + method returns something other than `None`) then, for that + data array alone, it is used in preference to the strategy + defined by the *dataset_shards* parameter. + + The *dataset_shards* parameter may be one of: + + * `None` + + No sharding. + + * `int` + + The integer number of dataset chunks to be stored in a + single shard, favouring an equal number of dataset + chunks along each shard dimension. + + *Example:* + For two-dimensional data, ``dataset_shards=9`` will + result in shards that span 3 chunks along each + dimension. + cfa: `str` or `dict` or `None`, optional Specify which netCDF variables, if any, should be written as CF-netCDF aggregation variables. @@ -713,18 +759,18 @@ class write(ReadWrite): * ``'uri'``: `str` - Specify the URI format of the fragment file names. + Specify the URI format of the fragment dataset names. - If ``'default'`` (the default) then the fragment file + If ``'default'`` (the default) then the fragment dataset names will be written with the same URI formats that - they had when read from input files (for file names - originating from the reading of normal non-aggregation - variables, this will result in absolute URIs). If - ``'absolute'`` then all fragment file names will be - written as absolute URIs. If ``'relative'`` then all - fragment file names will be written as relative-path URI - references URIs, relative to the location of the - aggregation file. + they had when read from input datasets (for dataset + names originating from the reading of normal + non-aggregation variables, this will result in absolute + URIs). If ``'absolute'`` then all fragment dataset names + will be written as absolute URIs. If ``'relative'`` then + all fragment dataset names will be written as + relative-path URI references, relative to the + location of the aggregation dataset. * ``'strict'``: `bool` @@ -740,6 +786,9 @@ class write(ReadWrite): Define the CF data model implementation that defines field and metadata constructs and their components. + filename: Deprecated at version NEXTVERSION + Use *dataset_name* instead. 
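The interplay between the *dataset_chunks* and *dataset_shards* keywords and a per-data strategy might look like the following sketch, based on the keywords documented above and the ``nc_set_dataset_chunksizes``/``nc_set_dataset_shards`` data methods used elsewhere in this change (the output directory name ``'out.zarr'`` is illustrative):

.. code-block:: python

   import cfdm

   f = cfdm.example_field(0)

   # A chunking and sharding strategy stored on the data itself takes
   # precedence over the 'dataset_chunks' and 'dataset_shards'
   # keywords, for that data array alone.
   f.data.nc_set_dataset_chunksizes([2, 3])
   f.data.nc_set_dataset_shards(4)

   # Write a Zarr v3 dataset with a 4 MiB chunking strategy and
   # shards of 4 dataset chunks for any data that do not define their
   # own.
   cfdm.write(
       f,
       "out.zarr",
       fmt="ZARR3",
       dataset_chunks="4 MiB",
       dataset_shards=4,
   )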
+ :Returns: `None` @@ -761,7 +810,7 @@ class write(ReadWrite): def __new__( cls, fields, - filename, + dataset_name, fmt="NETCDF4", mode="w", overwrite=True, @@ -787,10 +836,11 @@ def __new__( coordinates=False, omit_data=None, dataset_chunks="4 MiB", + dataset_shards=None, cfa="auto", extra_write_vars=None, ): - """Write field and domain constructs to a netCDF file.""" + """Write field and domain constructs to a dataset.""" # Flatten the sequence of intput fields fields = tuple(cls._flat(fields)) if not fields: @@ -825,7 +875,7 @@ def __new__( netcdf = NetCDFWrite(cls.implementation) netcdf.write( fields, - filename, + dataset_name, fmt=fmt, mode=mode, overwrite=overwrite, @@ -850,5 +900,6 @@ def __new__( extra_write_vars=extra_write_vars, omit_data=omit_data, dataset_chunks=dataset_chunks, + dataset_shards=dataset_shards, cfa=cfa, ) diff --git a/cfdm/test/test_Data.py b/cfdm/test/test_Data.py index fc631d1b8..4de1db103 100644 --- a/cfdm/test/test_Data.py +++ b/cfdm/test/test_Data.py @@ -2877,6 +2877,40 @@ def test_Data_dtype(self): if a is not np.ma.masked: self.assertEqual(a.dtype, d.dtype) + def test_Data_dataset_shards(self): + """Test Data.nc_dataset_shards.""" + d = cfdm.Data(np.arange(24).reshape(2, 3, 4)) + + self.assertIsNone(d.nc_dataset_shards()) + self.assertIsNone(d.nc_set_dataset_shards([1, 2, 3])) + self.assertEqual(d.nc_dataset_shards(), (1, 2, 3)) + self.assertEqual(d.nc_clear_dataset_shards(), (1, 2, 3)) + self.assertIsNone(d.nc_dataset_shards()) + + self.assertIsNone(d.nc_set_dataset_shards(None)) + self.assertIsNone(d.nc_dataset_shards()) + + self.assertIsNone(d.nc_set_dataset_shards(100)) + self.assertEqual(d.nc_dataset_shards(), 100) + + # Check that shards get copied + self.assertEqual(d.copy().nc_dataset_shards(), 100) + + # Bad shards + for shards in ( + [2], + [-99, 3, 4], + [2, 3, 3.14], + ["bad", 3, 4], + [2, None, 4], + [2, 3, -1], + "bad", + -1, + 3.14, + ): + with self.assertRaises(ValueError): + d.nc_set_dataset_shards(shards) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index 5bb7b2fb4..09c34d5d5 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -171,6 +171,7 @@ def test_groups(self): # ------------------------------------------------------------ name = "grid_latitude" g.construct(name).bounds.nc_set_variable_groups(["forecast"]) + grouped_file = "grouped_file.nc" cfdm.write(g, grouped_file) nc = netCDF4.Dataset(grouped_file, "r") @@ -304,6 +305,7 @@ def test_groups_geometry(self): g.nc_set_component_variable("interior_ring", "interior_ring") g.nc_set_component_variable_groups("interior_ring", ["forecast"]) + grouped_file = "grouped_file.nc" cfdm.write(g, grouped_file) # Check that the variable is in the right group diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py new file mode 100644 index 000000000..3f24ea303 --- /dev/null +++ b/cfdm/test/test_zarr.py @@ -0,0 +1,341 @@ +import atexit +import datetime +import faulthandler +import os +import shutil +import tempfile +import unittest + +faulthandler.enable() # to debug seg faults and timeouts + +import zarr + +import cfdm + +warnings = False + +# Set up temporary directories +tmpdirs = [ + tempfile.mkdtemp("_test_zarr.zarr", dir=os.getcwd()) for i in range(2) +] +[tmpdir1, tmpdir2] = tmpdirs + +# Set up temporary files +tmpfiles = [ + tempfile.mkstemp("_test_zarr.nc", dir=os.getcwd())[1] for i in range(2) +] +[tmpfile1, tmpfile2] = tmpfiles + + +def _remove_tmpdirs(): + """Remove 
temporary files created during tests.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + for d in tmpdirs: + try: + shutil.rmtree(d) + os.rmdir(d) + except OSError: + pass + + +atexit.register(_remove_tmpdirs) + + +class read_writeTest(unittest.TestCase): + """Test the reading and writing of field constructs from/to disk.""" + + f0 = cfdm.example_field(0) + + def setUp(self): + """Preparations called immediately before each test method.""" + # Disable log messages to silence expected warnings + cfdm.LOG_LEVEL("DISABLE") + # Note: to enable all messages for given methods, lines or + # calls (those without a 'verbose' option to do the same) + # e.g. to debug them, wrap them (for methods, start-to-end + # internally) as follows: cfdm.LOG_LEVEL('DEBUG') + # + # < ... test code ... > + # cfdm.log_level('DISABLE') + + def test_zarr_read_write_1(self): + """Test Zarr read/write on example fields.""" + for i, f in enumerate(cfdm.example_fields()): + if i in (8, 9, 10): + # Can't write UGRID yet + continue + + cfdm.write(f, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1) + self.assertEqual(len(z), 1) + z = z[0] + self.assertTrue(z.equals(f)) + + # Check that the Zarr and netCDF4 encodings are equivalent + cfdm.write(f, tmpfile1, fmt="NETCDF4") + n = cfdm.read(tmpfile1)[0] + self.assertTrue(z.equals(n)) + + def test_zarr_read_write_2(self): + """Test Zarr read/write on various netCDF files.""" + for filename in ( + "DSG_timeSeries_contiguous.nc", + "DSG_timeSeries_indexed.nc", + "DSG_timeSeriesProfile_indexed_contiguous.nc", + "gathered.nc", + "geometry_1.nc", + "geometry_2.nc", + "geometry_3.nc", + "geometry_4.nc", + "string_char.nc", + ): + n = cfdm.read(filename) + cfdm.write(n, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1) + self.assertEqual(len(z), len(n)) + for a, b in zip(z, n): + self.assertTrue(a.equals(b)) + + def test_zarr_read_write_chunks_shards(self): + """Test Zarr read/write with chunks and shards.""" + f = self.f0.copy() + f.data.nc_set_dataset_chunksizes([2, 3]) + + cfdm.write(f, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1)[0] + self.assertTrue(z.equals(f)) + + z = zarr.open(tmpdir1) + self.assertEqual(z["q"].chunks, (2, 3)) + self.assertIsNone(z["q"].shards) + + # Make shards comprising 4 chunks + cfdm.write(f, tmpdir1, fmt="ZARR3", dataset_shards=4) + z = cfdm.read(tmpdir1, store_dataset_shards=False)[0] + self.assertTrue(z.equals(f)) + self.assertIsNone(z.data.nc_dataset_shards()) + + z = zarr.open(tmpdir1) + self.assertEqual(z["q"].chunks, (2, 3)) + self.assertEqual(z["q"].shards, (4, 6)) + + for shards in (4, [2, 2]): + f.data.nc_set_dataset_shards(shards) + cfdm.write(f, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1)[0] + self.assertTrue(z.equals(f)) + self.assertEqual(z.data.nc_dataset_shards(), (2, 2)) + + z = zarr.open(tmpdir1) + self.assertEqual(z["q"].chunks, (2, 3)) + self.assertEqual(z["q"].shards, (4, 6)) + + def test_zarr_read_write_CFA(self): + """Test CF aggregation in Zarr.""" + f = self.f0 + + cfdm.write(f, tmpdir1, fmt="ZARR3") + cfdm.write(f, tmpfile1, fmt="NETCDF4") + + z = cfdm.read(tmpdir1, cfa_write="field")[0] + n = cfdm.read(tmpfile1, cfa_write="field")[0] + + self.assertTrue(z.equals(f)) + self.assertTrue(z.equals(n)) + + cfdm.write(z, tmpdir2, fmt="ZARR3", cfa="field") + cfdm.write(n, tmpfile2, fmt="NETCDF4", cfa="field") + + z = cfdm.read(tmpdir2)[0] + n = cfdm.read(tmpfile2)[0] + + self.assertTrue(z.equals(f)) + self.assertTrue(z.equals(n)) + + def test_zarr_write_append(self): + """Test in append mode with Zarr.""" + # Check 
that append mode does not work for Zarr + f = self.f0 + cfdm.write(f, tmpdir1, fmt='ZARR3') + with self.assertRaises(ValueError): + cfdm.write(f, tmpdir1, fmt='ZARR3', mode="a") + + def test_zarr_groups_1(self): + """Test for the general handling of Zarr hierarchical groups.""" + f = cfdm.example_field(1) + + # Add a second grid mapping + datum = cfdm.Datum(parameters={"earth_radius": 7000000}) + conversion = cfdm.CoordinateConversion( + parameters={"grid_mapping_name": "latitude_longitude"} + ) + + grid = cfdm.CoordinateReference( + coordinate_conversion=conversion, + datum=datum, + coordinates=["auxiliarycoordinate0", "auxiliarycoordinate1"], + ) + + f.set_construct(grid) + + grid0 = f.construct("grid_mapping_name:rotated_latitude_longitude") + grid0.del_coordinate("auxiliarycoordinate0") + grid0.del_coordinate("auxiliarycoordinate1") + + grouped_dir = tmpdir1 + grouped_file = tmpfile1 + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + f.construct("grid_latitude").bounds.nc_set_variable_groups( + ["forecast"] + ) + for name in ( + "longitude", # Auxiliary coordinate + "latitude", # Auxiliary coordinate + "long_name=Grid latitude name", # Auxiliary coordinate + "measure:area", # Cell measure + "surface_altitude", # Domain ancillary + "air_temperature standard_error", # Field ancillary + "grid_mapping_name:rotated_latitude_longitude", + "time", # Dimension coordinate + "grid_latitude", # Dimension coordinate + ): + f.construct(name).nc_set_variable_groups(["forecast"]) + + # Check the groups + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") + + n = cfdm.read(grouped_file)[0] + z = cfdm.read(grouped_dir)[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + # Directly check the groups in the Zarr dataset + x = zarr.open(grouped_dir) + self.assertEqual(list(x.group_keys()), ["forecast"]) + self.assertEqual(list(x["forecast"].group_keys()), ["model"]) + + cfdm.write(z, tmpdir2, fmt="ZARR3") + z1 = cfdm.read(tmpdir2)[0] + self.assertTrue(z1.equals(f)) + + def test_zarr_groups_dimension(self): + """Test Zarr groups dimensions.""" + f = self.f0.copy() + + grouped_dir = tmpdir1 + grouped_file = tmpfile1 + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + for construct in f.constructs.filter_by_data().values(): + construct.nc_set_variable_groups(["forecast"]) + + for construct in f.coordinates().values(): + try: + construct.bounds.nc_set_variable_groups(["forecast"]) + except ValueError: + pass + + domain_axis = f.domain_axis("latitude") + domain_axis.nc_set_dimension_groups(["forecast"]) + + # Check the groups + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") + + n = cfdm.read(grouped_file)[0] + z = cfdm.read(grouped_dir)[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + # Check that grouped netCDF datasets can only be read with + # 'closest_ancestor' + cfdm.read(grouped_file, group_dimension_search="closest_ancestor") + for gsn in ("furthest_ancestor", "local", "BAD VALUE"): + with self.assertRaises(ValueError): + cfdm.read(grouped_file, group_dimension_search=gsn) + + def test_zarr_groups_DSG(self): + """Test Zarr groups containing DSGs.""" + f = cfdm.example_field(4) + + grouped_dir = tmpdir1 + grouped_file = tmpfile1 + + f.compress("indexed_contiguous", inplace=True) + f.data.get_count().nc_set_variable("count") + f.data.get_index().nc_set_variable("index") + + # Set some groups. 
(Write the read the field first to create + # the compressions variables on disk.) + cfdm.write(f, tmpfile2) + f = cfdm.read(tmpfile2)[0] + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + f.data.get_count().nc_set_variable_groups(["forecast"]) + f.data.get_index().nc_set_variable_groups(["forecast"]) + f.construct("altitude").nc_set_variable_groups(["forecast"]) + f.data.get_count().nc_set_sample_dimension_groups(["forecast"]) + + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") + + n = cfdm.read(grouped_file) + z = cfdm.read(grouped_dir) + + n = n[0] + z = z[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + def test_zarr_groups_geometry(self): + """Test Zarr groups containing cell geometries.""" + f = cfdm.example_field(6) + + grouped_dir = tmpdir1 + grouped_file = tmpfile1 + + cfdm.write(f, tmpfile2) + f = cfdm.read(tmpfile2)[0] + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + f.nc_set_geometry_variable_groups(["forecast"]) + f.coordinate("longitude").bounds.nc_set_variable_groups(["forecast"]) + f.nc_set_component_variable_groups("node_count", ["forecast"]) + f.nc_set_component_variable_groups("part_node_count", ["forecast"]) + f.nc_set_component_variable("interior_ring", "interior_ring") + f.nc_set_component_variable_groups("interior_ring", ["forecast"]) + + # Check the groups + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") + + n = cfdm.read(grouped_file)[0] + z = cfdm.read(grouped_dir)[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + def test_zarr_read_v2(self): + """Test reading Zarr v2.""" + f2 = cfdm.read("example_field_0.zarr2") + f3 = cfdm.read("example_field_0.zarr3") + self.assertEqual(len(f2), len(f3)) + self.assertEqual(len(f2), 1) + self.assertTrue(f2[0].equals(f3[0])) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cfdm.environment() + print("") + unittest.main(verbosity=2) diff --git a/docs/source/functions.rst b/docs/source/functions.rst index e26e414c7..d0f9f42a7 100644 --- a/docs/source/functions.rst +++ b/docs/source/functions.rst @@ -20,8 +20,8 @@ Reading and writing cfdm.read cfdm.write - cfdm.netcdf_flatten - cfdm.netcdf_index + cfdm.dataset_flatten + cfdm.netcdf_indexer Mathematical operations ----------------------- diff --git a/docs/source/installation.rst b/docs/source/installation.rst index d9a345691..2b517b39c 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -178,6 +178,11 @@ Tests are run from within the ``cfdm/test`` directory: **Dependencies** ---------------- +.. _Required: + +Required +^^^^^^^^ + The cfdm package requires: * `Python `_, version 3.10 or newer. @@ -195,8 +200,6 @@ The cfdm package requires: * `h5py `_, version 3.12.1 or newer. -* `zarr `_, version 3.0.8 or newer. - * `s3fs `_, version 2024.6.0 or newer. * `dask `_, version 2025.5.1 or newer. @@ -237,6 +240,19 @@ The cfdm package requires: then setting the ``UDUNITS2_XML_PATH`` environment variable is the likely solution. +Optional +^^^^^^^^ + +Some further dependencies that enable further functionality are +optional. This is to facilitate cfdm being installed in restricted +environments for which these features are not required. + +.. rubric:: Zarr + +* `zarr `_, version 3.1.3 or newer. + + For reading and writing Zarr datasets. + ---- .. 
_Code-repository: diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index 09a3877db..8b80caa07 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -85,7 +85,8 @@ The `cfdm` package can * create new field and domain constructs in memory, -* write field and domain constructs to netCDF datasets on disk, +* write field and domain constructs to netCDF and Zarr datasets on + disk, * read, write, and manipulate UGRID mesh topologies, diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 7a07c8785..8b42c92f6 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -3465,7 +3465,7 @@ Method Classes ------------------- The `cfdm.write` function writes a field construct, or a sequence of -field constructs, to a netCDF file on disk: +field constructs, to a netCDF or Zarr dataset on disk: .. code-block:: python :caption: *Write a field construct to a netCDF dataset on disk.* @@ -3535,8 +3535,8 @@ By default the output file will be for CF-|version|. The `cfdm.write` function has optional parameters to -* set the output netCDF format (all netCDF3 and netCDF4 formats are - possible); +* set the output dataset format (all netCDF3 and netCDF4 formats, as + well as Zarr v3 are possible); * append to the netCDF file rather than over-writing it by default; diff --git a/requirements.txt b/requirements.txt index 3b37323a5..f41a3712e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ dask>=2025.5.1 distributed>=2025.5.1 uritools>=4.0.3 cfunits>=3.3.7 -zarr>=3.0.8 diff --git a/setup.py b/setup.py index 94fa75509..e9d5fd2be 100755 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ def _get_version(): * read field and domain constructs from netCDF, CDL, and Zarr datasets with a choice of netCDF backends, * be fully flexible with respect to dataset storage chunking, * create new field and domain constructs in memory, -* write and append field and domain constructs to netCDF datasets on disk, +* write and append field and domain constructs to netCDF and Zarr v3 datasets on disk, * read, write, and manipulate UGRID mesh topologies, * read, write, and create coordinates defined by geometry cells, * read and write netCDF4 string data-type variables, @@ -136,6 +136,9 @@ def _get_version(): "flake8", "pydocstyle", ], + "zarr": [ + "zarr>=3.1.3", + ], } setup(