From ad3f63f744eafddd57b936e2c6cdd8877a0cdb25 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 22 Aug 2025 13:50:31 +0100 Subject: [PATCH 01/39] dev --- cfdm/read_write/netcdf/netcdfwrite.py | 81 ++++++++++++++++++++------- 1 file changed, 61 insertions(+), 20 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index ddc27e5be..7982c37d2 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -286,8 +286,9 @@ def _write_attributes(self, parent, ncvar, extra=None, omit=()): del netcdf_attrs["_FillValue"] if not g["dry_run"]: - g["nc"][ncvar].setncatts(netcdf_attrs) - + # TODOZARR + self._aaa(ncvar, netcdf_attrs) + if skip_set_fill_value: # Re-add as known attribute since this FV is already set netcdf_attrs["_FillValue"] = self.implementation.get_data( @@ -296,6 +297,14 @@ def _write_attributes(self, parent, ncvar, extra=None, omit=()): return netcdf_attrs + def _aaa(self, ncvar, attributes): + """TODOZARR""" + g = self.write_vars + if g['netCDF']: + g["nc"][ncvar].setncatts(attributes) + elif g["zarr"]: + g["nc"][ncvar].update_attributes(attributes) + def _character_array(self, array): """Converts a numpy array of strings to character data type. @@ -1196,7 +1205,9 @@ def _write_geometry_container(self, field, geometry_container): if not g["dry_run"]: self._createVariable(**kwargs) - g["nc"][ncvar].setncatts(geometry_container) + # TODOZARR + #g["nc"][ncvar].setncatts(geometry_container) + self._aaa(ncvar, geometry_container) # Update the 'geometry_containers' dictionary g["geometry_containers"][ncvar] = geometry_container @@ -2490,7 +2501,7 @@ def _create_external( return external - def _createVariable(self, **kwargs): + def _createVariable(self, kwargs): """Create a variable in the netCDF file. .. versionadded:: (cfdm) 1.7.0 @@ -2498,8 +2509,24 @@ def _createVariable(self, **kwargs): """ g = self.write_vars ncvar = kwargs["varname"] - g["nc"][ncvar] = g["netcdf"].createVariable(**kwargs) - + + if g["netCDF"]: + g["nc"][ncvar] = g["netcdf"].createVariable(**kwargs) + elif g["zarr"]: + # Convert netCDF4.createVariable kwargs to zarr_array + # kwargs + zarr_kwargs = {"name": ncvar, + "shape": Date: Fri, 22 Aug 2025 18:28:20 +0100 Subject: [PATCH 02/39] dev --- cfdm/read_write/netcdf/netcdfwrite.py | 880 ++++++++++++++------------ cfdm/read_write/write.py | 2 +- 2 files changed, 491 insertions(+), 391 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 7982c37d2..3140ab31b 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -30,7 +30,7 @@ class AggregationError(Exception): - """An error relating to CF-netCDF aggregation. + """An error relating to CF aggregation. .. versionadded:: (cfdm) 1.12.0.0 @@ -40,7 +40,12 @@ class AggregationError(Exception): class NetCDFWrite(IOWrite): - """A container for writing Fields to a netCDF dataset.""" + """A container for writing Fields to a dataset. + + Both netCDF and Zarr output formats are supported (despite the + name of the class!). + + """ def __new__(cls, *args, **kwargs): """Store the NetCDFRead class.""" @@ -73,28 +78,33 @@ def cf_cell_method_qualifiers(self): """Cell method qualifiers.""" return set(("within", "where", "over", "interval", "comment")) - def _create_netcdf_group(self, nc, group_name): - """Creates a new netCDF4 group object. + def _createGroup(self, parent, group_name): + """Creates a new dataset group object. .. 
versionadded:: (cfdm) 1.8.6.0 :Parameters: - nc: `netCDF4._netCDF4.Group` or `netCDF4.Dataset` - + parent: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` + group_name: `str` The name of the group. :Returns: - `netCDF4._netCDF4.Group` + `netCDF4.Group` or `Zarr.Group` The new group object. """ - return nc.createGroup(group_name) + backend = self.write_vars['backend'] + if backend == 'netCDF4': + return parent.createGroup(group_name) + + if backend == 'zarr': + return parent.create_group(group_name) - def _create_netcdf_variable_name(self, parent, default): - """Create an appropriate name for a netCDF variable. + def _create_variable_name(self, parent, default): + """Create an appropriate name for a dataset variable. .. versionadded:: (cfdm) 1.7.0 @@ -107,7 +117,7 @@ def _create_netcdf_variable_name(self, parent, default): :Returns: `str` - The netCDF variable name. + The dataset variable name. """ ncvar = self.implementation.nc_get_variable(parent, None) @@ -124,10 +134,10 @@ def _create_netcdf_variable_name(self, parent, default): # structure from the name. ncvar = self._remove_group_structure(ncvar) - return self._netcdf_name(ncvar) + return self._name(ncvar) - def _netcdf_name(self, base, dimsize=None, role=None): - """Return a new netCDF variable or dimension name. + def _name(self, base, dimsize=None, role=None): + """Return a new variable or dimension name for the dataset. .. versionadded:: (cfdm) 1.7.0 @@ -142,7 +152,7 @@ def _netcdf_name(self, base, dimsize=None, role=None): :Returns: `str` - NetCDF dimension name or netCDF variable name. + The name of the new dimension or variable. """ if base is None: @@ -161,7 +171,7 @@ def _netcdf_name(self, base, dimsize=None, role=None): for ncdim in g["dimensions_with_role"].get(role, ()): if g["ncdim_to_size"][ncdim] == dimsize: - # Return the name of an existing netCDF dimension + # Return the name of an existing dataset dimension # with this name, this size, and matching the # given role. return ncdim @@ -215,8 +225,8 @@ def _numpy_compressed(self, array): return array.flatten() - def _write_attributes(self, parent, ncvar, extra=None, omit=()): - """Write netCDF attributes to the netCDF file. + def _write_variable_attributes(self, parent, ncvar, extra=None, omit=()): + """Write variable attributes to the dataset. :Parameters: @@ -287,7 +297,7 @@ def _write_attributes(self, parent, ncvar, extra=None, omit=()): if not g["dry_run"]: # TODOZARR - self._aaa(ncvar, netcdf_attrs) + self._set_attributes(netcdf_attrs, ncvar) if skip_set_fill_value: # Re-add as known attribute since this FV is already set @@ -297,13 +307,23 @@ def _write_attributes(self, parent, ncvar, extra=None, omit=()): return netcdf_attrs - def _aaa(self, ncvar, attributes): + def _set_attributes(self, attributes, ncvar=None, group=None): """TODOZARR""" g = self.write_vars - if g['netCDF']: - g["nc"][ncvar].setncatts(attributes) - elif g["zarr"]: - g["nc"][ncvar].update_attributes(attributes) + if ncvar is not None: + # Set variable attributes + x = g["nc"][ncvar] + elif group is not None: + # Set group-level attributes + x = group + else: + raise ValueError("Must set ncvar or group") + + match g["backend"]: + case "netCDF": + x.setncatts(attributes) + case "zarr": + x.update_attributes(attributes) def _character_array(self, array): """Converts a numpy array of strings to character data type. 
@@ -396,7 +416,7 @@ def _datatype(self, variable): :Returns: `str` or str - The `netCDF4.createVariable` data type corresponding to the + The `_createVariable` data type corresponding to the datatype of the array of the input variable. """ @@ -423,7 +443,7 @@ def _datatype(self, variable): return f"{dtype.kind}{dtype.itemsize}" def _string_length_dimension(self, size): - """Creates a netCDF dimension for string variables if necessary. + """Creates a dataset dimension for string variables if necessary. :Parameters: @@ -432,7 +452,7 @@ def _string_length_dimension(self, size): :Returns: `str` - The netCDF dimension name. + The dataset dimension name. """ g = self.write_vars @@ -440,7 +460,7 @@ def _string_length_dimension(self, size): # ------------------------------------------------------------ # Create a new dimension for the maximum string length # ------------------------------------------------------------ - ncdim = self._netcdf_name( + ncdim = self._name( f"strlen{size}", dimsize=size, role="string_length" ) @@ -449,19 +469,32 @@ def _string_length_dimension(self, size): g["ncdim_to_size"][ncdim] = size # Define (and create if necessary) the group in which to - # place this netCDF dimension. + # place this dataset dimension. parent_group = self._parent_group(ncdim) if not g["dry_run"]: try: - parent_group.createDimension(ncdim, size) +# parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) except RuntimeError: pass # TODO convert to 'raise' via fixes upstream return ncdim - def _netcdf_dimensions(self, field, key, construct): - """Returns the netCDF dimension names for the construct axes. + def _createDimension(self, group, ncdim, size): + """TODOZARR + + """ + match self.write_vars['backend']: + case 'netCDF4': + group.createDimension(ncdim, size) + case 'zarr': + # Dimensions to not need to be created in Zarr + # datasets + pass + + def _dataset_dimensions(self, field, key, construct): + """Returns the dataset dimension names for the construct. The names are returned in a tuple. If the metadata construct has no data, then `None` is returned. @@ -479,7 +512,7 @@ def _netcdf_dimensions(self, field, key, construct): :Returns: `tuple` or `None` - The netCDF dimension names, or `None` if there are no + The dataset dimension names, or `None` if there are no data. """ @@ -514,7 +547,7 @@ def _netcdf_dimensions(self, field, key, construct): # ---------------------------------------------------- if sample_ncdim is None: # The list variable has not yet been written to - # the file, so write it and also get the netCDF + # the file, so write it and also get the dataset # name of the sample dimension. list_variable = self.implementation.get_list(construct) sample_ncdim = self._write_list_variable( @@ -533,7 +566,7 @@ def _netcdf_dimensions(self, field, key, construct): # has already been written to the file, ii) we already # have the position of the sample dimension in the # compressed array, and iii) we already have the - # netCDF name of the sample dimension. + # dataset name of the sample dimension. # ---------------------------------------------------- pass @@ -545,7 +578,7 @@ def _netcdf_dimensions(self, field, key, construct): # has already been written to the file, ii) we already # have the position of the sample dimension in the # compressed array, and iii) we already have the - # netCDF name of the sample dimension. + # dataset name of the sample dimension. 
# ---------------------------------------------------- pass elif compression_type == "ragged indexed contiguous": @@ -566,12 +599,12 @@ def _netcdf_dimensions(self, field, key, construct): def _write_dimension( self, ncdim, f, axis=None, unlimited=False, size=None ): - """Write a netCDF dimension to the file. + """Write a dimension to the dataset. :Parameters: ncdim: `str` - The netCDF dimension name. + The dataset dimension name. f: `Field` or `Domain` @@ -592,10 +625,14 @@ def _write_dimension( """ g = self.write_vars + if g['backend'] == 'zarr': + # Dimensions don't get written to Zarr datasets + return + if axis is not None: domain_axis = self.implementation.get_domain_axes(f)[axis] logger.info( - f" Writing {domain_axis!r} to netCDF dimension: {ncdim}" + f" Writing {domain_axis!r} to dimension: {ncdim}" ) # pragma: no cover size = self.implementation.get_domain_axis_size(f, axis) @@ -604,7 +641,7 @@ def _write_dimension( g["ncdim_to_size"][ncdim] = size # Define (and create if necessary) the group in which to place - # this netCDF dimension. + # this dataset dimension. parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -650,8 +687,8 @@ def _write_dimension( def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): """Writes a coordinate variable and its bounds variable to file. - This also writes a new netCDF dimension to the file and, if - required, a new netCDF dimension for the bounds. + For netCDF datasets, this also writes a new dimension to the + file and, if required, a new dimension for the bounds. :Parameters: @@ -662,10 +699,11 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): coord: Dimension coordinate construct ncdim: `str` or `None` - The name of the netCDF dimension for this dimension - coordinate construct, including any groups structure. Note - that the group structure may be different to the - coordinate variable, and the basename. + The name of the dataset dimension for this dimension + coordinate construct, including any groups + structure. Note that the group structure may be + different to the coordinate variable, and the + basename. coordinates: `list` This list may get updated in-place. @@ -675,7 +713,7 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): :Returns: `str` - The netCDF name of the dimension coordinate. + The dataset name of the dimension coordinate. """ g = self.write_vars @@ -699,11 +737,11 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): create = True # If the dimension coordinate is already in the file but not - # in an approriate group then we have to create a new netCDF + # in an approriate group then we have to create a new dataset # variable. This is to prevent a downstream error ocurring # when the parent data variable tries to reference one of its - # netCDF dimensions that is not in the same group nor a parent - # group. + # dataset dimensions that is not in the same group nor a + # parent group. 
if already_in_file and not create: ncvar = coord.nc_get_variable("") groups = self._groups(seen[id(coord)]["ncvar"]) @@ -711,35 +749,35 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): create = True if create: - ncvar = self._create_netcdf_variable_name(coord, default=None) + ncvar = self._create_variable_name(coord, default=None) if ncvar is None: - # No netCDF variable name has been set, so use the - # corresponding netCDF dimension name + # No dataset variable name has been set, so use the + # corresponding dataset dimension name ncvar = ncdim if ncvar is None: - # No netCDF variable name not correponding to a netCDF - # dimension name has been set, so create a default - # netCDF variable name. - ncvar = self._create_netcdf_variable_name( + # No dataset variable name not correponding to a + # dataset dimension name has been set, so create a + # default dataset variable name. + ncvar = self._create_variable_name( coord, default="coordinate" ) ncdim = ncvar - # Create a new dimension + # Create a new dataset dimension (null-op for Zarr) unlimited = self._unlimited(f, axis) self._write_dimension(ncdim, f, axis, unlimited=unlimited) - ncdimensions = self._netcdf_dimensions(f, key, coord) + ncdimensions = self._dataset_dimensions(f, key, coord) # If this dimension coordinate has bounds then write the - # bounds to the netCDF file and add the 'bounds' or + # bounds to the dataset and add the 'bounds' or # 'climatology' attribute (as appropriate) to a dictionary # of extra attributes extra = self._write_bounds(f, coord, key, ncdimensions, ncvar) - # Create a new dimension coordinate variable + # Create a new dimension coordinate dataset variable self._write_netcdf_variable( ncvar, ncdimensions, @@ -756,8 +794,8 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): g["axis_to_ncdim"][axis] = seen[id(coord)]["ncdims"][0] if g["coordinates"] and ncvar is not None: - # Add the dimension coordinate netCDF variable name to the - # 'coordinates' attribute + # Add the dimension coordinate dataset variable name to + # the 'coordinates' attribute coordinates.append(ncvar) return ncvar @@ -765,16 +803,16 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): def _write_count_variable( self, f, count_variable, ncdim=None, create_ncdim=True ): - """Write a count variable to the netCDF file.""" + """Write a count variable to the dataset.""" g = self.write_vars if not self._already_in_file(count_variable): - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( count_variable, default="count" ) if create_ncdim: - ncdim = self._netcdf_name(ncdim) + ncdim = self._name(ncdim) self._write_dimension( ncdim, f, @@ -788,7 +826,7 @@ def _write_count_variable( _ = self.implementation.nc_get_sample_dimension( count_variable, "element" ) - sample_ncdim = self._netcdf_name(_) + sample_ncdim = self._name(_) self._write_dimension( sample_ncdim, f, @@ -819,7 +857,7 @@ def _write_index_variable( create_ncdim=True, instance_dimension=None, ): - """Write an index variable to the netCDF file. + """Write an index variable to the dataset. :Parameters: @@ -828,30 +866,30 @@ def _write_index_variable( index_variable: Index variable sample_dimension: `str` - The name of the netCDF sample dimension. + The name of the dataset sample dimension. ncdim: `str`, optional create_ncdim: bool, optional instance_dimension: `str`, optional - The name of the netCDF instance dimension. + The name of the dataset instance dimension. 
:Returns: `str` - The name of the netCDF sample dimension. + The name of the dataset sample dimension. """ g = self.write_vars if not self._already_in_file(index_variable): - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( index_variable, default="index" ) if create_ncdim: - ncdim = self._netcdf_name(ncdim) + ncdim = self._name(ncdim) self._write_dimension( ncdim, f, @@ -872,13 +910,13 @@ def _write_index_variable( return sample_dimension def _write_list_variable(self, f, list_variable, compress): - """Write a list variable to the netCDF file.""" + """Write a list variable to the dataset.""" g = self.write_vars create = not self._already_in_file(list_variable) if create: - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( list_variable, default="list" ) @@ -901,10 +939,10 @@ def _write_list_variable(self, f, list_variable, compress): return ncvar def _write_scalar_data(self, f, value, ncvar): - """Write a dimension coordinate and bounds to the netCDF file. + """Write a dimension coordinate and bounds to the dataset. - This also writes a new netCDF dimension to the file and, if - required, a new netCDF bounds dimension. + For netCDF datasets, this also writes a new dimension to the + file and, if required, a new bounds dimension. .. note:: This function updates ``g['seen']``. @@ -917,7 +955,7 @@ def _write_scalar_data(self, f, value, ncvar): :Returns: `str` - The netCDF name of the scalar data variable + The dataset name of the scalar data variable """ g = self.write_vars @@ -927,7 +965,7 @@ def _write_scalar_data(self, f, value, ncvar): create = not self._already_in_file(value, ncdims=()) if create: - ncvar = self._netcdf_name(ncvar) # DCH ? + ncvar = self._name(ncvar) # DCH ? # Create a new dimension coordinate variable self._write_netcdf_variable(ncvar, (), value, None) @@ -937,7 +975,7 @@ def _write_scalar_data(self, f, value, ncvar): return ncvar def _create_geometry_container(self, field): - """Create a geometry container variable in the netCDF file. + """Create a geometry container variable in the dataset. .. versionadded:: (cfdm) 1.8.0 @@ -948,7 +986,7 @@ def _create_geometry_container(self, field): :Returns: `dict` - A representation off the CF-netCDF geometry container + A representation off the CF geometry container variable for field construct. If there is no geometry container then the dictionary is empty. @@ -985,7 +1023,7 @@ def _create_geometry_container(self, field): try: coord_ncvar = g["seen"][id(coord)]["ncvar"] except KeyError: - # There is no netCDF auxiliary coordinate variable + # There is no auxiliary coordinate dataset variable pass else: gc[geometry_id].setdefault("coordinates", []).append( @@ -1106,13 +1144,13 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): Specifically, returns True if a variable is logically equal any variable in the g['seen'] dictionary. - If this is the case then the variable has already been written to - the output netCDF file and so we don't need to do it again. + If this is the case then the variable has already been written + to the output dataset and so we don't need to do it again. - If 'ncdims' is set then a extra condition for equality is applied, - namely that of 'ncdims' being equal to the netCDF dimensions - (names and order) to that of a variable in the g['seen'] - dictionary. 
+ If 'ncdims' is set then a extra condition for equality is + applied, namely that of 'ncdims' being equal to the dataset + dimensions (names and order) to that of a variable in the + g['seen'] dictionary. When `True` is returned, the input variable is added to the g['seen'] dictionary. @@ -1147,9 +1185,9 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): for value in seen.values(): if ncdims is not None and ncdims != value["ncdims"]: - # The netCDF dimensions (names and order) of the input - # variable are different to those of this variable in - # the 'seen' dictionary + # The dataset dimensions (names and order) of the + # input variable are different to those of this + # variable in the 'seen' dictionary continue # Still here? @@ -1166,14 +1204,14 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): return False def _write_geometry_container(self, field, geometry_container): - """Write a netCDF geometry container variable. + """Write a geometry container variable to the dataset. .. versionadded:: (cfdm) 1.8.0 :Returns: `str` - The netCDF variable name for the geometry container. + The dataset variable name for the geometry container. """ g = self.write_vars @@ -1187,7 +1225,7 @@ def _write_geometry_container(self, field, geometry_container): ncvar = self.implementation.nc_get_geometry_variable( field, default="geometry_container" ) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) logger.info( f" Writing geometry container variable: {ncvar}" @@ -1207,7 +1245,7 @@ def _write_geometry_container(self, field, geometry_container): # TODOZARR #g["nc"][ncvar].setncatts(geometry_container) - self._aaa(ncvar, geometry_container) + self._set_attributes(ncvar, geometry_container) # Update the 'geometry_containers' dictionary g["geometry_containers"][ncvar] = geometry_container @@ -1217,11 +1255,10 @@ def _write_geometry_container(self, field, geometry_container): def _write_bounds( self, f, coord, coord_key, coord_ncdimensions, coord_ncvar=None ): - """Creates a bounds netCDF variable and returns its name. + """Creates a bounds dataset variable. - Specifically, creates a bounds netCDF variable, creating a new - bounds netCDF dimension if required. Returns the bounds - variable's netCDF variable name. + For netCDF datasets, also creates a new bounds dimension if + required. .. versionadded:: (cfdm) 1.7.0 @@ -1235,11 +1272,12 @@ def _write_bounds( The coordinate construct key. coord_ncdimensions: `tuple` of `str` - The ordered netCDF dimension names of the coordinate's - dimensions (which do not include the bounds dimension). + The ordered dataset dimension names of the + coordinate's dimensions (which do not include the + bounds dimension). coord_ncvar: `str` - The netCDF variable name of the parent variable + The datset variable name of the parent variable :Returns: @@ -1285,7 +1323,7 @@ def _write_bounds( size = data.shape[-1] - # bounds_ncdim = self._netcdf_name('bounds{0}'.format(size), + # bounds_ncdim = self._name('bounds{0}'.format(size), # dimsize=size, role='bounds') bounds_ncdim = self.implementation.nc_get_dimension( @@ -1296,7 +1334,7 @@ def _write_bounds( # structure from the name. 
bounds_ncdim = self._remove_group_structure(bounds_ncdim) - bounds_ncdim = self._netcdf_name( + bounds_ncdim = self._name( bounds_ncdim, dimsize=size, role="bounds" ) @@ -1313,14 +1351,14 @@ def _write_bounds( ncdim_to_size = g["ncdim_to_size"] if bounds_ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} netCDF dimension for " + f" Writing size {size} dimension for " f"bounds: {bounds_ncdim}" ) # pragma: no cover ncdim_to_size[bounds_ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. parent_group = self._parent_group(bounds_ncdim) if g["group"] and "/" in bounds_ncdim: @@ -1338,7 +1376,7 @@ def _write_bounds( except RuntimeError: raise - # Set the netCDF bounds variable name + # Set the bounds dataset variable name default = coord_ncvar + "_bounds" else: default = "bounds" @@ -1352,7 +1390,7 @@ def _write_bounds( # group structure from the name (for now). ncvar = self._remove_group_structure(ncvar) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) # If no groups have been set on the bounds, then put the # bounds variable in the same group as its parent @@ -1371,7 +1409,7 @@ def _write_bounds( if self.implementation.has_property(coord, prop): omit.append(prop) - # Create the bounds netCDF variable + # Create the bounds dataset variable self._write_netcdf_variable( ncvar, ncdimensions, @@ -1399,14 +1437,14 @@ def _write_bounds( def _write_node_coordinates( self, f, coord, coord_ncvar, coord_ncdimensions ): - """Create a netCDF node coordinates variable. + """Create a node coordinates dataset variable. This will create: - * A netCDF node dimension, if required. - * A netCDF node count variable, if required. - * A netCDF part node count variable, if required. - * A netCDF interior ring variable, if required. + * A dataset node dimension, if required. + * A dataset node count variable, if required. + * A dataset part node count variable, if required. + * A dataset interior ring variable, if required. .. versionadded:: (cfdm) 1.8.0 @@ -1461,10 +1499,10 @@ def _write_node_coordinates( nodes, inherited_properties ) - # Find the base of the netCDF part dimension name + # Find the base of the 'part' dataset dimension name size = self.implementation.get_data_size(nodes) ncdim = self._get_node_ncdimension(nodes, default="node") - ncdim = self._netcdf_name(ncdim, dimsize=size, role="node") + ncdim = self._name(ncdim, dimsize=size, role="node") create = True if self._already_in_file(nodes, (ncdim,)): @@ -1504,13 +1542,13 @@ def _write_node_coordinates( if ncdim not in ncdim_to_size: size = self.implementation.get_data_size(nodes) logger.info( - f" Writing size {size} netCDF node dimension: {ncdim}" + f" Writing size {size} node dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -1521,7 +1559,7 @@ def _write_node_coordinates( if not g["dry_run"]: parent_group.createDimension(ncdim, size) - # Set an appropriate default netCDF node coordinates + # Set an appropriate default node coordinates dataset # variable name axis = self.implementation.get_property(bounds, "axis") if axis is not None: @@ -1537,9 +1575,9 @@ def _write_node_coordinates( # group structure from the name. 
ncvar = self._remove_group_structure(ncvar) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF node coordinates variable + # Create the node coordinates dataset variable self._write_netcdf_variable( ncvar, (ncdim,), @@ -1584,7 +1622,7 @@ def _write_node_coordinates( def _write_node_count( self, f, coord, bounds, coord_ncdimensions, encodings ): - """Create a netCDF node count variable. + """Create a node count dataset variable. .. versionadded:: (cfdm) 1.8.0 @@ -1595,7 +1633,7 @@ def _write_node_count( bounds: coord_ncdimensions: sequence of `str` - The netCDF instance dimension + The dataset instance dimension encodings: `dict` Ignored. @@ -1627,7 +1665,7 @@ def _write_node_count( count = self.implementation.initialise_Count() self.implementation.set_data(count, data, copy=False) - # Find the base of the netCDF node count variable name + # Find the base of the node count dataset variable name nc = self.implementation.get_node_count(coord) if nc is not None: @@ -1658,12 +1696,12 @@ def _write_node_count( # created, so create it now. if geometry_dimension not in g["ncdim_to_size"]: raise ValueError( - "The netCDF geometry dimension should already exist ..." + "The dataset geometry dimension should already exist ..." ) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF node count variable + # Create the node count dataset variable self._write_netcdf_variable( ncvar, (geometry_dimension,), count, None ) @@ -1674,27 +1712,27 @@ def _write_node_count( def _get_part_ncdimension(self, coord, default=None): """Gets dimension name for part node counts or interior rings. - Specifically, gets the base of the netCDF dimension for part + Specifically, gets the base of the dataset dimension for part node count and interior ring variables. .. versionadded:: (cfdm) 1.8.0 :Returns: - The netCDF dimension name, or else the value of the *default* - parameter. + The dataset dimension name, or else the value of the + *default* parameter. """ ncdim = None pnc = self.implementation.get_part_node_count(coord) if pnc is not None: - # Try to get the netCDF dimension from a part node count + # Try to get the dataset dimension from a part node count # variable ncdim = self.implementation.nc_get_dimension(pnc, default=None) if ncdim is None: - # Try to get the netCDF dimension from an interior ring + # Try to get the dataset dimension from an interior ring # variable interior_ring = self.implementation.get_interior_ring(coord) if interior_ring is not None: @@ -1703,7 +1741,7 @@ def _get_part_ncdimension(self, coord, default=None): ) if ncdim is not None: - # Found a netCDF dimension + # Found a dataset dimension if not self.write_vars["group"]: # A flat file has been requested, so strip off any # group structure from the name. @@ -1725,27 +1763,27 @@ def _parent_group(self, name): :Parameters: name: `str` - The name of the netCDF dimension or variable. + The name of the dataset dimension or variable. 
:Returns: - `netCDF.Dataset` or `netCDF._netCDF4.Group` + `netCDF.Dataset` or `netCDF.Group` or `zarr.Group` """ g = self.write_vars - parent_group = g["netcdf"] + parent_group = g["dataset"] if not g["group"] or "/" not in name: return parent_group if not name.startswith("/"): raise ValueError( - f"Invalid netCDF name {name!r}: missing a leading '/'" + f"Invalid dataset name {name!r}: missing a leading '/'" ) for group_name in name.split("/")[1:-1]: - parent_group = self._write_group(parent_group, group_name) + parent_group = self._createGroup(parent_group, group_name) return parent_group @@ -1820,7 +1858,7 @@ def _groups(self, name): return groups def _get_node_ncdimension(self, bounds, default=None): - """Get the netCDF dimension from a node count variable. + """Get the dataset dimension from a node count variable. .. versionadded:: (cfdm) 1.8.0 @@ -1832,13 +1870,13 @@ def _get_node_ncdimension(self, bounds, default=None): :Returns: - The netCDF dimension name, or else the value of the *default* + The dimension name, or else the value of the *default* parameter. """ ncdim = self.implementation.nc_get_dimension(bounds, default=None) if ncdim is not None: - # Found a netCDF dimension + # Found a dimension if not self.write_vars["group"]: # A flat file has been requested, so strip off any # group structure from the name. @@ -1850,11 +1888,7 @@ def _get_node_ncdimension(self, bounds, default=None): return default def _write_part_node_count(self, f, coord, bounds, encodings): - """Creates a bounds netCDF variable and returns its name. - - Create a bounds netCDF variable, creating a new bounds netCDF - dimension if required. Return the bounds variable's netCDF - variable name. + """Creates a part node count variable and returns its name. .. versionadded:: (cfdm) 1.8.0 @@ -1862,9 +1896,6 @@ def _write_part_node_count(self, f, coord, bounds, encodings): coord: - coord_ncvar: `str` - The netCDF variable name of the parent variable - :Returns: `dict` @@ -1900,7 +1931,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): count = self.implementation.initialise_Count() self.implementation.set_data(count, data, copy=False) - # Find the base of the netCDF part_node_count variable name + # Find the base of the dataset part_node_count variable name pnc = self.implementation.get_part_node_count(coord) if pnc is not None: ncvar = self.implementation.nc_get_variable( @@ -1918,7 +1949,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): else: ncvar = "part_node_count" - # Find the base of the netCDF part dimension name + # Find the base of the dataset part dimension name size = self.implementation.get_data_size(count) if g["part_ncdim"] is not None: ncdim = g["part_ncdim"] @@ -1926,7 +1957,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim = encodings["part_ncdim"] else: ncdim = self._get_part_ncdimension(coord, default="part") - ncdim = self._netcdf_name(ncdim, dimsize=size, role="part") + ncdim = self._name(ncdim, dimsize=size, role="part") if self._already_in_file(count, (ncdim,)): # This part node count variable has been previously @@ -1936,13 +1967,13 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim_to_size = g["ncdim_to_size"] if ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} netCDF part " f"dimension{ncdim}" + f" Writing size {size} part " f"dimension{ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. 
+ # to place this dataset dimension. parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -1953,9 +1984,9 @@ def _write_part_node_count(self, f, coord, bounds, encodings): if not g["dry_run"]: parent_group.createDimension(ncdim, size) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF part_node_count variable + # Create the dataset part_node_count variable self._write_netcdf_variable(ncvar, (ncdim,), count, None) g["part_ncdim"] = ncdim @@ -1964,7 +1995,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): return {"part_node_count": ncvar, "part_ncdim": ncdim} def _write_interior_ring(self, f, coord, bounds, encodings): - """Write an interior ring variable to the netCDF file. + """Write an interior ring variable to the dataset. .. versionadded:: (cfdm) 1.8.0 @@ -1973,7 +2004,7 @@ def _write_interior_ring(self, f, coord, bounds, encodings): coord: coord_ncvar: `str` - The netCDF variable name of the parent variable + The dataset variable name of the parent variable encodings: @@ -2014,7 +2045,7 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim = encodings["part_ncdim"] else: ncdim = self._get_part_ncdimension(coord, default="part") - ncdim = self._netcdf_name(ncdim, dimsize=size, role="part") + ncdim = self._name(ncdim, dimsize=size, role="part") if self._already_in_file(interior_ring, (ncdim,)): # This interior ring variable has been previously created, @@ -2024,12 +2055,12 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim_to_size = g["ncdim_to_size"] if ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} netCDF part " f"dimension{ncdim}" + f" Writing size {size} part " f"dimension{ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size # Define (and create if necessary) the group in which - # to place this netCDF dimension. + # to place this dataset dimension. parent_group = self._parent_group(ncdim) if g["group"] and "/" in ncdim: @@ -2040,9 +2071,9 @@ def _write_interior_ring(self, f, coord, bounds, encodings): if not g["dry_run"]: parent_group.createDimension(ncdim, size) - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) - # Create the netCDF interior ring variable + # Create the dataset interior ring variable self._write_netcdf_variable( ncvar, (ncdim,), @@ -2059,13 +2090,13 @@ def _write_interior_ring(self, f, coord, bounds, encodings): def _write_scalar_coordinate( self, f, key, coord_1d, axis, coordinates, extra=None ): - """Write a scalar coordinate and its bounds to the netCDF file. + """Write a scalar coordinate and its bounds to the dataset. - It is assumed that the input coordinate is has size 1, but this is not - checked. + It is assumed that the input coordinate is has size 1, but + this is not checked. - If an equal scalar coordinate has already been written to the file - then the input coordinate is not written. + If an equal scalar coordinate has already been written to the + file then the input coordinate is not written. :Parameters: @@ -2083,7 +2114,8 @@ def _write_scalar_coordinate( :Returns: coordinates: `list` - The updated list of netCDF auxiliary coordinate names. + The updated list of auxiliary coordinate dataset + variable names. 
""" # To avoid mutable default argument (an anti-pattern) of extra={} @@ -2097,11 +2129,11 @@ def _write_scalar_coordinate( scalar_coord = self.implementation.squeeze(coord_1d, axes=0) if not self._already_in_file(scalar_coord, ()): - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( scalar_coord, default="scalar" ) # If this scalar coordinate has bounds then create the - # bounds netCDF variable and add the 'bounds' or + # bounds dataset variable and add the 'bounds' or # 'climatology' (as appropriate) attribute to the # dictionary of extra attributes bounds_extra = self._write_bounds(f, scalar_coord, key, (), ncvar) @@ -2129,10 +2161,10 @@ def _write_scalar_coordinate( return coordinates def _write_auxiliary_coordinate(self, f, key, coord, coordinates): - """Write auxiliary coordinates and bounds to the netCDF file. + """Write auxiliary coordinates and bounds to the dataset. - If an equal auxiliary coordinate has already been written to the file - then the input coordinate is not written. + If an equal auxiliary coordinate has already been written to + the file then the input coordinate is not written. :Parameters: @@ -2149,16 +2181,16 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): :Returns: `list` - The list of netCDF auxiliary coordinate names updated in - place. + The list of auxiliary coordinate dataset variable + names updated in place. """ g = self.write_vars ncvar = None - # The netCDF dimensions for the auxiliary coordinate variable - ncdimensions = self._netcdf_dimensions(f, key, coord) + # The dataset dimensions for the auxiliary coordinate variable + ncdimensions = self._dataset_dimensions(f, key, coord) coord = self._change_reference_datetime(coord) @@ -2194,14 +2226,14 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): f, coord, key, ncdimensions, coord_ncvar=None ) else: - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( coord, default="auxiliary" ) # TODO: move setting of bounds ncvar to here - why? # If this auxiliary coordinate has bounds then create - # the bounds netCDF variable and add the 'bounds', + # the bounds dataset variable and add the 'bounds', # 'climatology' or 'nodes' attribute (as appropriate) # to the dictionary of extra attributes. extra = self._write_bounds(f, coord, key, ncdimensions, ncvar) @@ -2225,7 +2257,7 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): return coordinates def _write_domain_ancillary(self, f, key, anc): - """Write a domain ancillary and its bounds to the netCDF file. + """Write a domain ancillary and its bounds to the dataset. If an equal domain ancillary has already been written to the file athen it is not re-written. @@ -2244,7 +2276,7 @@ def _write_domain_ancillary(self, f, key, anc): :Returns: `str` - The netCDF variable name of the domain ancillary variable. + The dataset name of the domain ancillary variable. """ g = self.write_vars @@ -2252,19 +2284,19 @@ def _write_domain_ancillary(self, f, key, anc): if g["post_dry_run"]: logger.warning( "At present domain ancillary constructs of appended fields " - "may not be handled correctly by netCDF write append mode " + "may not be handled correctly by write append mode " "and can appear as extra fields. Set them on fields using " "`set_domain_ancillary` and similar methods if required." 
) - ncdimensions = self._netcdf_dimensions(f, key, anc) + ncdimensions = self._dataset_dimensions(f, key, anc) create = not self._already_in_file(anc, ncdimensions, ignore_type=True) if not create: ncvar = g["seen"][id(anc)]["ncvar"] else: - # See if we can set the default netCDF variable name to + # See if we can set the default dataset variable name to # its formula_terms term default = None for ref in self.implementation.get_coordinate_references( @@ -2286,10 +2318,10 @@ def _write_domain_ancillary(self, f, key, anc): if default is None: default = "domain_ancillary" - ncvar = self._create_netcdf_variable_name(anc, default=default) + ncvar = self._create_variable_name(anc, default=default) - # If this domain ancillary has bounds then create the bounds - # netCDF variable + # If this domain ancillary has bounds then create the + # bounds dataset variable self._write_bounds(f, anc, key, ncdimensions, ncvar) # Create a new domain ancillary variable @@ -2311,10 +2343,10 @@ def _write_field_ancillary( key, anc, ): - """Write a field ancillary to the netCDF file. + """Write a field ancillary to the dataset. - If an equal field ancillary has already been written to the file - then it is not re-written. + If an equal field ancillary has already been written to the + file then it is not re-written. :Parameters: @@ -2327,7 +2359,7 @@ def _write_field_ancillary( :Returns: `str` - The netCDF variable name of the field ancillary + The dataset variable name of the field ancillary object. If no ancillary variable was written then an empty string is returned. @@ -2338,14 +2370,14 @@ def _write_field_ancillary( """ g = self.write_vars - ncdimensions = self._netcdf_dimensions(f, key, anc) + ncdimensions = self._dataset_dimensions(f, key, anc) create = not self._already_in_file(anc, ncdimensions) if not create: ncvar = g["seen"][id(anc)]["ncvar"] else: - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( anc, default="ancillary_data" ) @@ -2363,10 +2395,10 @@ def _write_field_ancillary( return ncvar def _write_cell_measure(self, f, key, cell_measure): - """Write a cell measure construct to the netCDF file. + """Write a cell measure construct to the dataset. - If an identical construct has already in the file then the cell - measure will not be written. + If an identical construct has already in the file then the + cell measure will not be written. 
:Parameters: @@ -2390,11 +2422,11 @@ def _write_cell_measure(self, f, key, cell_measure): measure = self.implementation.get_measure(cell_measure) if measure is None: raise ValueError( - "Can't create a CF-netCDF cell measure variable " + "Can't create a CF cell measure variable " "without a 'measure' property" ) - ncdimensions = self._netcdf_dimensions(f, key, cell_measure) + ncdimensions = self._dataset_dimensions(f, key, cell_measure) if self._already_in_file(cell_measure, ncdimensions): # Use existing cell measure variable @@ -2406,8 +2438,8 @@ def _write_cell_measure(self, f, key, cell_measure): ) if ncvar is None: raise ValueError( - "Can't create an external CF-netCDF cell measure " - "variable without a netCDF variable name" + "Can't create an external CF cell measure " + "variable without a dataset variable name" ) # Add ncvar to the global external_variables attribute @@ -2426,7 +2458,7 @@ def _write_cell_measure(self, f, key, cell_measure): ncdimensions=ncdimensions, ) else: - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( cell_measure, default="cell_measure" ) @@ -2456,10 +2488,13 @@ def _set_external_variables(self, ncvar): if ncvar not in external_variables: external_variables.add(ncvar) if not g["dry_run"] and not g["post_dry_run"]: - g["netcdf"].setncattr( - "external_variables", " ".join(sorted(external_variables)) - ) - + self._set_attributes( + {"external_variables": " ".join(sorted(external_variables))} + ) +# g["dataset"].setncattr( +# "external_variables", " ".join(sorted(external_variables)) +# ) + def _create_external( self, field=None, construct_id=None, ncvar=None, ncdimensions=None ): @@ -2478,7 +2513,7 @@ def _create_external( field=field, construct_id=construct_id ) - # Set the correct netCDF variable and dimension names + # Set the correct dataset variable and dimension names self.implementation.nc_set_variable(external, ncvar) external_domain_axes = self.implementation.get_domain_axes(external) @@ -2501,34 +2536,40 @@ def _create_external( return external - def _createVariable(self, kwargs): - """Create a variable in the netCDF file. + def _createVariable(self, **kwargs): + """Create a variable in the dataset. .. versionadded:: (cfdm) 1.7.0 """ - g = self.write_vars + g = self.write_vars ncvar = kwargs["varname"] - - if g["netCDF"]: - g["nc"][ncvar] = g["netcdf"].createVariable(**kwargs) - elif g["zarr"]: - # Convert netCDF4.createVariable kwargs to zarr_array - # kwargs - zarr_kwargs = {"name": ncvar, - "shape": =1.8) # ------------------------------------------------------------ if g["group"]: @@ -2788,13 +2829,13 @@ def _write_netcdf_variable( ncdim_groups = self._groups(ncdim) if not groups.startswith(ncdim_groups): raise ValueError( - f"Can't create netCDF variable {ncvar!r} from " - f"{cfvar!r} with netCDF dimension {ncdim!r} that is " + f"Can't create variable {ncvar!r} from " + f"{cfvar!r} with dimension {ncdim!r} that is " "not in the same group nor in a parent group." 
) # ------------------------------------------------------------ - # Replace netCDF dimension names with their basenames + # Replace dataset dimension names with their basenames # (CF>=1.8) # ------------------------------------------------------------ ncdimensions_basename = [ @@ -2802,7 +2843,7 @@ def _write_netcdf_variable( ] # ------------------------------------------------------------ - # Create a new netCDF variable + # Create a new dataset variable # ------------------------------------------------------------ kwargs = { "varname": ncvar, @@ -2851,6 +2892,12 @@ def _write_netcdf_variable( # per-variable quantization parameters, such as # "quantization_nsd"). if quantize_on_write: + if g["backend"] == "zarr": + raise NotImplementedError( + f"Can't yet quantize on write {cfvar!r} to a Zarr " + "dataset TODOZARR" + ) + # Set "implemention" to this version of the netCDF-C # library self.implementation.set_parameter( @@ -2979,19 +3026,19 @@ def _write_netcdf_variable( ) logger.info( - f" to netCDF variable: {ncvar}({', '.join(ncdimensions)})" + f" to variable: {ncvar}({', '.join(ncdimensions)})" ) # pragma: no cover # Adjust createVariable arguments for contiguous variables if kwargs["contiguous"]: - if g["netcdf"].data_model.startswith("NETCDF4"): + if g["dataset"].data_model.startswith("NETCDF4"): # NETCDF4 contiguous variables can't span unlimited # dimensions unlimited_dimensions = g["unlimited_dimensions"].intersection( kwargs["dimensions"] ) if unlimited_dimensions: - data_model = g["netcdf"].data_model + data_model = g["dataset"].data_model raise ValueError( f"Can't create variable {ncvar!r} in {data_model} " f"file from {cfvar!r}: In {data_model} it is not " @@ -3011,7 +3058,7 @@ def _write_netcdf_variable( message = ( f"Can't create variable in {g['netcdf'].data_model} file " f"from {cfvar!r}: {error}. " - f"netCDF4.createVariable arguments: {kwargs}" + f"_createVariable arguments: {kwargs}" ) if error == ( "NetCDF: Not a valid data type or _FillValue type mismatch" @@ -3033,14 +3080,14 @@ def _write_netcdf_variable( raise RuntimeError(message) # ------------------------------------------------------------ - # Write attributes to the netCDF variable + # Write attributes to the dataset variable # ------------------------------------------------------------ - attributes = self._write_attributes( + attributes = self._write_variable_attributes( cfvar, ncvar, extra=extra, omit=omit ) # ------------------------------------------------------------ - # Write data to the netCDF variable + # Write data to the dataset variable # # Note that we don't need to worry about scale_factor and # add_offset, since if a data array is *not* a numpy array, @@ -3080,7 +3127,7 @@ def _write_netcdf_variable( def _customise_createVariable( self, cfvar, construct_type, domain_axes, kwargs ): - """Customises `netCDF4.Dataset.createVariable` keywords. + """Customises `_createVariable` keywords. The keyword arguments may be changed in subclasses which override this method. @@ -3108,13 +3155,11 @@ def _customise_createVariable( `dict` Dictionary of keyword arguments to be passed to - `netCDF4.Dataset.createVariable`. + `_createVariable`. """ # This method is trivial but the intention is that subclasses - # will override it to perform any desired - # customisation. Notably see the equivalent method in - # cf-python which is non-trivial. + # may override it to perform any desired customisation. 
return kwargs def _transform_strings(self, data, ncdimensions): @@ -3133,8 +3178,12 @@ def _transform_strings(self, data, ncdimensions): `Data`, `tuple` """ + # TODOZARR - consider always writing string arrays in zarr (rather than char arrays) + datatype = self._datatype(data) + + if data is not None and datatype == "S1": # -------------------------------------------------------- # Convert a string data type numpy array into a character @@ -3167,7 +3216,7 @@ def _write_data( construct_type=None, cfa=None, ): - """Write a data array to the netCDF file. + """Write a data array to the dataset. :Parameters: @@ -3187,8 +3236,8 @@ def _write_data( unset_values: sequence of numbers attributes: `dict`, optional - The netCDF attributes for the constructs that have been - written to the file. + The dataset attributes for the constructs that have + been written to the file. construct_type: `str` The construct type of the *cfvar*, or its parent if @@ -3434,28 +3483,28 @@ def _write_field_or_domain( # axes that define the domain. CF-1.9 data_axes = list(self.implementation.get_domain_axes(f)) - # Mapping of domain axis identifiers to netCDF dimension + # Mapping of domain axis identifiers to dataset dimension # names. This gets reset for each new field/domain that is # written to the file. # # For example: {'domainaxis1': 'lon'} g["axis_to_ncdim"] = {} - # Mapping of domain axis identifiers to netCDF scalar + # Mapping of domain axis identifiers to dataset scalar # coordinate variable names. This gets reset for each new # field/domain that is written to the file. # # For example: {'domainaxis0': 'time'} g["axis_to_ncscalar"] = {} - # Mapping of construct internal identifiers to netCDF variable - # names. This gets reset for each new field/domain that is - # written to the file. + # Mapping of construct internal identifiers to dataset + # variable names. This gets reset for each new field/domain + # that is written to the file. # # For example: {'dimensioncoordinate1': 'longitude'} g["key_to_ncvar"] = {} - # Mapping of construct internal identifiers to their netCDF + # Mapping of construct internal identifiers to their dataset # dimensions. This gets reset for each new field/domain that # is written to the file. # @@ -3513,7 +3562,7 @@ def _write_field_or_domain( ugrid = self.implementation.has_domain_topology(f) if ugrid: raise NotImplementedError( - "Can't yet create UGRID cf-netCDF files. " + "Can't yet write UGRID datasets. " "This feature is coming soon ..." 
) @@ -3692,7 +3741,7 @@ def _write_field_or_domain( data_axes.append(axis) # If the data array (now) spans this domain axis then - # create a netCDF dimension for it + # create a dataset dimension for it if axis in data_axes: axis_size0 = self.implementation.get_domain_axis_size( f, axis @@ -3759,7 +3808,7 @@ def _write_field_or_domain( and len(data_axes) == 2 and axis == data_axes[1] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][axis] = "ragged_contiguous_element" elif ( @@ -3767,7 +3816,7 @@ def _write_field_or_domain( and len(data_axes) == 2 and axis == data_axes[1] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][axis] = "ragged_indexed_element" elif ( @@ -3775,7 +3824,7 @@ def _write_field_or_domain( and len(data_axes) == 3 and axis == data_axes[1] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][ axis @@ -3785,7 +3834,7 @@ def _write_field_or_domain( and len(data_axes) == 3 and axis == data_axes[2] ): - # Do not create a netCDF dimension for the + # Do not create a dataset dimension for the # element dimension g["axis_to_ncdim"][ axis @@ -3803,7 +3852,7 @@ def _write_field_or_domain( # off any group structure from the name. ncdim = self._remove_group_structure(ncdim) - ncdim = self._netcdf_name(ncdim) + ncdim = self._name(ncdim) unlimited = self._unlimited(f, axis) self._write_dimension( @@ -3841,7 +3890,7 @@ def _write_field_or_domain( # Compression by gathering # # Write the list variable to the file, making a note - # of the netCDF sample dimension. + # of the dataset sample dimension. # ---------------------------------------------------- list_variable = self.implementation.get_list(f) compress = " ".join(compressed_ncdims) @@ -3854,7 +3903,7 @@ def _write_field_or_domain( # Compression by contiguous ragged array # # Write the count variable to the file, making a note - # of the netCDF sample dimension. + # of the dataset sample dimension. # ---------------------------------------------------- count = self.implementation.get_count(f) sample_ncdim = self._write_count_variable( @@ -3866,7 +3915,7 @@ def _write_field_or_domain( # Compression by indexed ragged array # # Write the index variable to the file, making a note - # of the netCDF sample dimension. + # of the dataset sample dimension. # ---------------------------------------------------- index = self.implementation.get_index(f) index_ncdim = self.implementation.nc_get_dimension( @@ -3891,7 +3940,7 @@ def _write_field_or_domain( # Compression by indexed contigous ragged array # # Write the index variable to the file, making a note - # of the netCDF sample dimension. + # of the dataset sample dimension. 
# ---------------------------------------------------- count = self.implementation.get_count(f) count_ncdim = self.implementation.nc_get_dimension( @@ -3978,7 +4027,7 @@ def _write_field_or_domain( ) # ------------------------------------------------------------ - # Create netCDF variables from domain ancillaries + # Create dataset variables from domain ancillaries # ------------------------------------------------------------ for key, anc in sorted( self.implementation.get_domain_ancillaries(f).items() @@ -3986,7 +4035,7 @@ def _write_field_or_domain( self._write_domain_ancillary(f, key, anc) # ------------------------------------------------------------ - # Create netCDF variables from cell measures + # Create dataset variables from cell measures # ------------------------------------------------------------ # Set the list of 'cell_measures' attribute values (each of # the form 'measure: name') @@ -3998,7 +4047,7 @@ def _write_field_or_domain( ] # ------------------------------------------------------------ - # Create netCDF formula_terms attributes from vertical + # Create formula_terms dataset attributes from vertical # coordinate references # ------------------------------------------------------------ for ref in g["formula_terms_refs"]: @@ -4072,8 +4121,9 @@ def _write_field_or_domain( if id(domain_anc) not in seen: continue - # Get the netCDF variable name for the domain - # ancillary and add it to the formula_terms attribute + # Get the dataset variable name for the domain + # ancillary and add it to the formula_terms + # attribute ncvar = seen[id(domain_anc)]["ncvar"] formula_terms.append(f"{term}: {ncvar}") @@ -4100,13 +4150,15 @@ def _write_field_or_domain( # g["nc"][ncvar].setncattr( # "formula_terms", formula_terms # ) - self._aaa(ncvar, {"formula_terms": formula_terms}) + self._set_attributes( + {"formula_terms": formula_terms}, ncvar + ) except KeyError: pass # TODO convert to 'raise' via fixes upstream logger.info( - " Writing formula_terms attribute to " - f"netCDF variable {ncvar}: {formula_terms!r}" + " Writing formula_terms attribute to variable " + f"{ncvar}: {formula_terms!r}" ) # pragma: no cover # Add the formula_terms attribute to the parent @@ -4120,12 +4172,15 @@ def _write_field_or_domain( # g["nc"][bounds_ncvar].setncattr( # "formula_terms", bounds_formula_terms # ) - self._aaa(bounds_ncvar, {"formula_terms": bounds_formula_terms}) + self._set_attributes( + {"formula_terms": bounds_formula_terms}, + bounds_ncvar + ) except KeyError: pass # TODO convert to 'raise' via fixes upstream logger.info( - " Writing formula_terms to netCDF bounds variable " + " Writing formula_terms to bounds variable " f"{bounds_ncvar}: {bounds_formula_terms!r}" ) # pragma: no cover @@ -4134,7 +4189,7 @@ def _write_field_or_domain( self._create_vertical_datum(ref, owning_coord_key) # ------------------------------------------------------------ - # Create netCDF variables grid mappings + # Create dataset grid mapping variables # ------------------------------------------------------------ multiple_grid_mappings = len(g["grid_mapping_refs"]) > 1 @@ -4146,8 +4201,8 @@ def _write_field_or_domain( # ------------------------------------------------------------ # Field ancillary variables # - # Create the 'ancillary_variables' CF-netCDF attribute and - # create the referenced CF-netCDF ancillary variables + # Create the 'ancillary_variables' CF attribute and create the + # referenced dataset ancillary variables # ------------------------------------------------------------ if field: 
ancillary_variables = [ @@ -4158,14 +4213,14 @@ def _write_field_or_domain( ] # ------------------------------------------------------------ - # Create the CF-netCDF data/domain variable + # Create the data/domain dataset variable # ------------------------------------------------------------ if field: default = "data" else: default = "domain" - ncvar = self._create_netcdf_variable_name(f, default=default) + ncvar = self._create_variable_name(f, default=default) ncdimensions = data_ncdimensions @@ -4176,7 +4231,7 @@ def _write_field_or_domain( cell_measures = " ".join(cell_measures) logger.info( " Writing cell_measures attribute to " - f"netCDF variable {ncvar}: {cell_measures!r}" + f"variable {ncvar}: {cell_measures!r}" ) # pragma: no cover extra["cell_measures"] = cell_measures @@ -4186,7 +4241,7 @@ def _write_field_or_domain( coordinates = " ".join(coordinates) logger.info( " Writing coordinates attribute to " - f"netCDF variable {ncvar}: {coordinates!r}" + f"variable {ncvar}: {coordinates!r}" ) # pragma: no cover extra["coordinates"] = coordinates @@ -4196,7 +4251,7 @@ def _write_field_or_domain( grid_mapping = " ".join(grid_mapping) logger.info( " Writing grid_mapping attribute to " - f"netCDF variable {ncvar}: {grid_mapping!r}" + f"variable {ncvar}: {grid_mapping!r}" ) # pragma: no cover extra["grid_mapping"] = grid_mapping @@ -4207,7 +4262,7 @@ def _write_field_or_domain( ancillary_variables = re.sub(r"\s+", " ", ancillary_variables) logger.info( " Writing ancillary_variables attribute to " - f"netCDF variable {ncvar}: {ancillary_variables!r}" + f"variable {ncvar}: {ancillary_variables!r}" ) # pragma: no cover extra["ancillary_variables"] = ancillary_variables @@ -4244,7 +4299,7 @@ def _write_field_or_domain( cell_methods = " ".join(cell_methods_strings) logger.info( " Writing cell_methods attribute to " - f"netCDF variable {ncvar}: {cell_methods}" + f"variable {ncvar}: {cell_methods}" ) # pragma: no cover extra["cell_methods"] = cell_methods @@ -4261,7 +4316,7 @@ def _write_field_or_domain( extra["geometry"] = gc_ncvar # ------------------------------------------------------------ - # Create a new CF-netCDF data/domain variable + # Create a new data/domain dataset variable # ------------------------------------------------------------ # Omit any global attributes from the variable omit = g["global_attributes"] @@ -4396,26 +4451,31 @@ def _unlimited(self, field, axis): """ return self.implementation.nc_is_unlimited_axis(field, axis) - def _write_group(self, parent_group, group_name): - """Creates a new netCDF4 parent group object. - - .. versionadded:: (cfdm) 1.8.6.0 - - :Parameters: - - parent_group: `netCDF4.Dateset` or `netCDF4._netCDF4.Group` - - group_name: `str` - - :Returns: - - `netCDF4._netCDF4.Group` - - """ - return parent_group.createGroup(group_name) + #def _write_group(self, parent_group, group_name): + # """Creates a new parent group object. + # + # .. versionadded:: (cfdm) 1.8.6.0 + # + # :Parameters: + # + # parent_group: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` + # + # group_name: `str` + # + # :Returns: + # + # `netCDF4.Group` or `zarr.Group` + # + # """ + # backend = self.write_vars['backend'] + # if backend == 'netCDF4': + # return parent_group.createGroup(group_name) + # + # if backend == 'zarr': + # return parent_group.create_group(group_name) def _write_group_attributes(self, fields): - """Writes the netCDF group-level attributes to the file. + """Writes the group-level attributes to the file. 
:Parameters: @@ -4475,25 +4535,26 @@ def _write_group_attributes(self, fields): f0, attr ) - nc = g["netcdf"] + nc = g["dataset"] # TODOZARR for group in groups: if group in nc.groups: nc = nc.groups[group] else: - nc = self._create_netcdf_group(nc, group) + nc = self._createGroup(nc, group) if not g["dry_run"]: - nc.setncatts(this_group_attributes) + # nc.setncatts(this_group_attributes) + self._set_attributes(this_group_attributes, group=nc) group_attributes[groups] = tuple(this_group_attributes) g["group_attributes"] = group_attributes def _write_global_attributes(self, fields): - """Writes all netCDF global properties to the netCDF4 dataset. + """Writes all global properties to the dataset. - Specifically, finds the netCDF global properties from all of - the input fields and writes them to the `netCDF4.Dataset`. + Specifically, finds the global properties from all of the + input fields and writes them to the root group of the dataset. :Parameters: @@ -4628,34 +4689,45 @@ def _write_global_attributes(self, fields): delimiter = "," if not g["dry_run"] and not g["post_dry_run"]: - g["netcdf"].setncattr( - "Conventions", delimiter.join(g["Conventions"]) - ) + attrs = {"Conventions", delimiter.join(g["Conventions"])} +# g["dataset"].setncattr( +# "Conventions", delimiter.join(g["Conventions"]) +# ) # ------------------------------------------------------------ # Write the file descriptors to the file # ------------------------------------------------------------ - for attr, value in g["file_descriptors"].items(): - g["netcdf"].setncattr(attr, value) + attrs.update(g["file_descriptors"]) +# for attr, value in g["file_descriptors"].items(): +# g["dataset"].setncattr(attr, value) # ------------------------------------------------------------ # Write other global attributes to the file # ------------------------------------------------------------ - for attr in global_attributes - set(("Conventions",)): - g["netcdf"].setncattr( - attr, self.implementation.get_property(f0, attr) - ) + attrs.update( + { + attr: self.implementation.get_property(f0, attr) + for attr in global_attributes - set(("Conventions",)) + } + ) + # for attr in global_attributes - set(("Conventions",)): +# g["dataset"].setncattr( +# attr, self.implementation.get_property(f0, attr) +# ) # ------------------------------------------------------------ # Write "forced" global attributes to the file # ------------------------------------------------------------ - for attr, v in force_global.items(): - g["netcdf"].setncattr(attr, v) + attrs.update(force_global) + + self._set_attributes(attrs, group=g["dataset"]) +# for attr, v in force_global.items(): +# g["dataset"].setncattr(attr, v) g["global_attributes"] = global_attributes - def file_close(self, filename): - """Close the netCDF file that has been written. + def dataset_close(self, filename): + """Close the dataset that has been written. .. versionadded:: (cfdm) 1.7.0 @@ -4664,10 +4736,12 @@ def file_close(self, filename): `None` """ - self.write_vars["netcdf"].close() + g = self.write_vars + if g["backend"] == "netCDF4": + g["dataset"].close() - def file_open(self, filename, mode, fmt, fields): - """Open the netCDF file for writing. + def dataset_open(self, filename, mode, fmt, fields): + """Open the dataset for writing. .. versionadded:: (cfdm) 1.7.0 @@ -4694,8 +4768,7 @@ def file_open(self, filename, mode, fmt, fields): :Returns: - `netCDF.Dataset` - A `netCDF4.Dataset` object for the file. 
+ `netCDF.Dataset` or `zarr.Group` """ if fields and mode == "w": @@ -4707,17 +4780,25 @@ def file_open(self, filename, mode, fmt, fields): f"data that needs to be read: {f!r} uses {filename}" ) - # mode == 'w' is safer than != 'a' in case of a typo (the letters - # are neighbours on a QWERTY keyboard) since 'w' is destructive. - # Note that for append ('a') mode the original file is never wiped. - if mode == "w" and self.write_vars["overwrite"]: - os.remove(filename) - - try: - nc = netCDF4.Dataset(filename, mode, format=fmt) - except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + g = self.write_vars + match g['backend']: + case 'netCDF4': + # mode == 'w' is safer than != 'a' in case of a typo + # (the letters are neighbours on a QWERTY keyboard) + # since 'w' is destructive. Note that for append + # ('a') mode the original file is never wiped. + if mode == "w" and g["overwrite"]: + os.remove(filename) + + try: + nc = netCDF4.Dataset(filename, mode, format=fmt) + except RuntimeError as error: + raise RuntimeError(f"{error}: {filename}") + case 'zarr' + nc = zarr.group( + filename, overwrite=g["overwrite"], zarr_format=3) + return nc @_manage_log_level_via_verbosity @@ -5119,7 +5200,7 @@ def write( # The directory of the aggregation file "aggregation_file_directory": None, # Cache the CF aggregation variable write status for each - # netCDF variable + # dataset variable "cfa_write_status": {}, # -------------------------------------------------------- # Dataset chunking stategy @@ -5127,7 +5208,7 @@ def write( "dataset_chunks": dataset_chunks, # -------------------------------------------------------- # Quantization: Store unique Quantization objects, keyed - # by their output netCDF variable names. + # by their output dataset variable names. # -------------------------------------------------------- "quantization": {}, } @@ -5409,7 +5490,7 @@ def _file_io_iteration( if "Conventions" in variable_attributes: raise ValueError( "Can't prevent the 'Conventions' property from being " - f"a netCDF global variable: {variable_attributes}" + f"a CF global variable: {variable_attributes}" ) if global_attributes: @@ -5458,8 +5539,10 @@ def _file_io_iteration( g["least_significant_digit"] = least_significant_digit g["fmt"] = fmt - g["zarr"] = fmt == "ZARR" - g['netCDF4'] = not g["zarr"] + if fmt == "ZARR": + g['backend'] = 'zarr' + else: + g['backend'] = 'netCDF4' if isinstance( fields, @@ -5486,7 +5569,7 @@ def _file_io_iteration( g["overwrite"] = overwrite # ------------------------------------------------------------ - # Open the output netCDF file + # Open the output dataset # ------------------------------------------------------------ if os.path.isfile(filename): if mode == "w" and not overwrite: @@ -5504,7 +5587,7 @@ def _file_io_iteration( g["overwrite"] = False g["filename"] = filename - g["netcdf"] = self.file_open(filename, mode, fmt, fields) + g["dataset"] = self.dataset_open(filename, mode, fmt, fields) if not g["dry_run"]: # -------------------------------------------------------- @@ -5553,7 +5636,7 @@ def _file_io_iteration( # For append mode, it is cleaner code-wise to close the file # on the read iteration and re-open it for the append # iteration. So we always close it here. 
-        self.file_close(filename)
+        self.dataset_close(filename)
 
         # ------------------------------------------------------------
         # Write external fields to the external file
@@ -5608,7 +5691,7 @@ def _int32(self, array):
         return array
 
     def _dimension_in_subgroup(self, v, ncdim):
-        """Return True if the netCDF dimension is in a valid group.
+        """Return True if the dimension is in a valid group.
 
         Returns True if the dimension is in the same group, or a
         parent group, as the group defined by the construct. Otherwise
@@ -5621,7 +5704,7 @@ def _dimension_in_subgroup(self, v, ncdim):
             v: Construct
 
             ncdim: `str`
-                The netCDF dimension name.
+                The dataset dimension name.
 
                 *Parameter example:*
                   ``'lat'``
@@ -5632,8 +5715,7 @@ def _dimension_in_subgroup(self, v, ncdim):
         :Returns:
 
             `bool`
-                Whether or not the netCDF dimension is in a valid
-                group.
+                Whether or not the dimension is in a valid group.
 
         """
         v_groups = self.implementation.nc_get_variable_groups(v)
@@ -5655,7 +5737,7 @@ def _customise_write_vars(self):
         pass
 
     def _chunking_parameters(self, data, ncdimensions):
-        """Set chunking parameters for `netCDF4.createVariable`.
+        """Set chunking parameters for a dataset variable.
 
         .. versionadded:: (cfdm) 1.11.2.0
 
@@ -5665,13 +5747,13 @@ def _chunking_parameters(self, data, ncdimensions):
                 The data being written.
 
             ncdimensions: `tuple`
-                The data netCDF dimensions.
+                The dataset dimensions of the data.
 
         :Returns:
 
             2-tuple
                 The *contiguous* and *chunksizes* parameters for
-                `netCDF4.createVariable`.
+                `_createVariable`.
 
         """
         if data is None:
@@ -5687,6 +5769,10 @@ def _chunking_parameters(self, data, ncdimensions):
         chunksizes = self.implementation.nc_get_dataset_chunksizes(data)
         if chunksizes == "contiguous":
            # Contiguous as defined by 'data'
+            if g['backend'] == 'zarr':
+                # Return a single chunk
+                return False, self._shape_in_dataset(data, ncdimensions)
+
             return True, None
 
         # Still here?
@@ -5702,6 +5788,10 @@ def _chunking_parameters(self, data, ncdimensions):
         # dataset_chunks
         if dataset_chunks == "contiguous":
             # Contiguous as defined by 'dataset_chunks'
+            if g['backend'] == 'zarr':
+                # Return a single chunk
+                return False, self._shape_in_dataset(data, ncdimensions)
+
             return True, None
 
         # Still here? Then work out the chunks from both the
@@ -5731,6 +5821,15 @@ def _chunking_parameters(self, data, ncdimensions):
             # data contiguously.
             return True, None
 
+    def _shape_in_dataset(self, data, ncdimensions):
+        """TODOZARR"""
+        if self._compressed_data(ncdimensions):
+            d = self.implementation.get_compressed_array(data)
+        else:
+            d = data
+
+        return d.shape
+
     def _compressed_data(self, ncdimensions):
         """Whether or not the data is being written in compressed
         form.
 
@@ -5739,7 +5838,7 @@ def _compressed_data(self, ncdimensions):
         :Parameters:
 
             ncdimensions: `sequence` of `str`
-                The ordered netCDF dimension names of the data. These
+                The ordered dataset dimension names of the data. These
                 are the dimensions going into the file, and if the
                 data is compressed will differ from the dimensions
                 implied by the data in memory.
@@ -5872,7 +5971,7 @@ def _cfa_write_status(self, ncvar, cfvar, construct_type, domain_axes):
         return False
 
     def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar):
-        """Write an aggregation variable to the netCDF file.
+        """Write an aggregation variable to the dataset.
 
         .. versionadded:: (cfdm) 1.12.0.0
 
@@ -5883,7 +5982,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar):
                 data.
 
             ncvar: `str`
-                The netCDF name for the variable.
+                The dataset name for the variable.
 
ncdimensions: sequence of `str` @@ -5899,7 +5998,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): g = self.write_vars # ------------------------------------------------------------ - # Write the fragment array variables to the netCDF file + # Write the fragment array variables to the dataset # ------------------------------------------------------------ aggregated_data = data.nc_get_aggregated_data() aggregated_data_attr = [] @@ -5963,7 +6062,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): chunking = None - # Get the fragment array netCDF dimensions from the + # Get the fragment array dataset dimensions from the # 'location' fragment array variable. location_ncdimensions = [] for ncdim, size in zip(ncdimensions, f_uris.shape): @@ -6025,8 +6124,8 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): feature = "unique_values" f_unique_value = cfa[feature] - # Get the fragment array netCDF dimensions from the - # 'value' fragment array variable. + # Get the fragment array dimensions from the 'value' + # fragment array variable. unique_value_ncdimensions = [] for ncdim, size in zip(ncdimensions, f_unique_value.shape): cfa_ncdim = f"a_{ncdim}" @@ -6038,7 +6137,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): unique_value_ncdimensions = tuple(unique_value_ncdimensions) - # Write the fragment array variable to the netCDF dataset + # Write the fragment array variable to the dataset feature_ncvar = self._cfa_write_fragment_array_variable( f_unique_value, aggregated_data.get(feature, f"fragment_{feature}"), @@ -6049,7 +6148,7 @@ def _cfa_create_data(self, cfa, ncvar, ncdimensions, data, cfvar): # ------------------------------------------------------------ # Add the aggregation variable attributes # ------------------------------------------------------------ - self._write_attributes( + self._write_variable_attributes( None, ncvar, extra={ @@ -6097,25 +6196,24 @@ def _cfa_write_fragment_array_variable( The data to write. ncvar: `str` - The netCDF variable name. + The dataset variable name. ncdimensions: `tuple` of `str` - The fragment array variable's netCDF dimensions. + The fragment array variable's dataset dimensions. attributes: `dict`, optional Any attributes to attach to the variable. chunking: sequence, optional - Set `netCDF4.createVariable` 'contiguous' and - `chunksizes` parameters (in that order) for the - fragment array variable. If not set (the default), - then these parameters are inferred from the data. + Set `_createVariable` 'contiguous' and `chunksizes` + parameters (in that order) for the fragment array + variable. If not set (the default), then these + parameters are inferred from the data. :Returns: `str` - The netCDF variable name of the fragment array - variable. + The name of the fragment array dataset variable. """ create = not self._already_in_file(data, ncdimensions) @@ -6123,7 +6221,7 @@ def _cfa_write_fragment_array_variable( if create: # Create a new fragment array variable in the file, with # 'contiguous' chunking - ncvar = self._netcdf_name(ncvar) + ncvar = self._name(ncvar) self._write_netcdf_variable( ncvar, ncdimensions, @@ -6211,7 +6309,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): if not data.nc_get_aggregation_write_status(): raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF aggregation variable. " + f"Can't write {cfvar!r} as a CF aggregation variable. 
" "This is probably because some fragment values have been " "changed relative to those in the fragment files, or a " "rechunking has occured." @@ -6288,7 +6386,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): # This Dask chunk's data is not a reference to # fragment file raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF " + f"Can't write {cfvar!r} as a CF " "aggregation variable: " f"The Dask chunk in position {position} " f"(defined by data index {index!r}) does not " @@ -6302,7 +6400,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): # This Dask chunk's data is a reference to # fragment file, but only to a subspace of it. raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF " + f"Can't write {cfvar!r} as a CF " "aggregation variable: " f"The Dask chunk in position {position} " f"(defined by data index {index!r}) references " @@ -6330,7 +6428,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): if scheme != aggregation_file_scheme: raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF " + f"Can't write {cfvar!r} as a CF " "aggregation variable: " "Attempting to create a relative-path URI " f"reference for the fragment file {fragment}, " @@ -6385,7 +6483,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): d.persist(inplace=True) except AggregationError as error: raise AggregationError( - f"Can't write {cfvar!r} as a CF-netCDF aggregation " + f"Can't write {cfvar!r} as a CF aggregation " "variable. " "At least one Dask chunk has more than one unique value: " f"{error}. " @@ -6399,7 +6497,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): return out def _write_quantization_container(self, quantization): - """Write a CF-netCDF quantization container variable. + """Write a CF quantization container variable. .. note:: It is assumed, but not checked, that the per-variable parameters (such as "quantization_nsd" @@ -6416,7 +6514,7 @@ def _write_quantization_container(self, quantization): :Returns: `str` - The netCDF variable name for the quantization + The dataset variable name for the quantization container. """ @@ -6428,12 +6526,12 @@ def _write_quantization_container(self, quantization): return ncvar # Create a new quantization container variable - ncvar = self._create_netcdf_variable_name( + ncvar = self._create_variable_name( quantization, default="quantization" ) logger.info( - f" Writing {quantization!r} to netCDF variable: {ncvar}" + f" Writing {quantization!r} to variable: {ncvar}" ) # pragma: no cover kwargs = { @@ -6453,7 +6551,9 @@ def _write_quantization_container(self, quantization): # self.implementation.parameters(quantization) # ) # TODOZARR - self._aaa(ncvar, self.implementation.parameters(quantization)) + self._set_attributes( + self.implementation.parameters(quantization), ncvar + ) # Update the quantization dictionary g["quantization"][ncvar] = quantization diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 85f2df22d..2cab8d468 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -587,7 +587,7 @@ class write(ReadWrite): number of bytes (taken either from the *dataset_chunks* parameter, or as stored by the data itself), "square-like" dataset chunk shapes - are preferred that maximise the amount of chunks + are preferred that maximise the number of chunks that are completely filled with data values. 
For example, with *dataset_chunks* of ``'4 MiB'``, a data array of 64-bit floats with shape (400, From e1d4c97dddbcf979c234223006de811caa3dbcac Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 23 Aug 2025 11:50:00 +0100 Subject: [PATCH 03/39] first successful write --- cfdm/read_write/abstract/abstractio.py | 4 +- cfdm/read_write/netcdf/constants.py | 3 + cfdm/read_write/netcdf/netcdfwrite.py | 146 ++++++++++++++++--------- cfdm/read_write/write.py | 14 +-- 4 files changed, 107 insertions(+), 60 deletions(-) diff --git a/cfdm/read_write/abstract/abstractio.py b/cfdm/read_write/abstract/abstractio.py index ae1cde4d7..c16c5f244 100644 --- a/cfdm/read_write/abstract/abstractio.py +++ b/cfdm/read_write/abstract/abstractio.py @@ -16,12 +16,12 @@ def __init__(self, implementation): self.implementation = implementation @abc.abstractmethod - def file_close(self, *args, **kwargs): + def dataset_close(self, *args, **kwargs): """Close the dataset file.""" raise NotImplementedError() # pragma: no cover @abc.abstractmethod - def file_open(self, *args, **kwargs): + def dataset_open(self, *args, **kwargs): """Open the dataset file.""" raise NotImplementedError() # pragma: no cover diff --git a/cfdm/read_write/netcdf/constants.py b/cfdm/read_write/netcdf/constants.py index e29b04dce..6bde9eaf9 100644 --- a/cfdm/read_write/netcdf/constants.py +++ b/cfdm/read_write/netcdf/constants.py @@ -67,6 +67,9 @@ # NetCDF-4 file formats NETCDF4_FMTS = ("NETCDF4", "NETCDF4_CLASSIC") +# Zarr dataset formats +ZARR_FMTS = ("ZARR3",) + # -------------------------------------------------------------------- # Quantization # -------------------------------------------------------------------- diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 3140ab31b..0e1b207c9 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -6,6 +6,8 @@ import dask.array as da import netCDF4 import numpy as np +import zarr + from dask import config as dask_config from dask.array.core import normalize_chunks from dask.utils import parse_bytes @@ -22,7 +24,7 @@ NETCDF3_FMTS, NETCDF4_FMTS, NETCDF_QUANTIZATION_PARAMETERS, - NETCDF_QUANTIZE_MODES, + NETCDF_QUANTIZE_MODES, ZARR_FMTS ) from .netcdfread import NetCDFRead @@ -323,6 +325,7 @@ def _set_attributes(self, attributes, ncvar=None, group=None): case "netCDF": x.setncatts(attributes) case "zarr": + print('ATTR=', attributes) x.update_attributes(attributes) def _character_array(self, array): @@ -474,7 +477,7 @@ def _string_length_dimension(self, size): if not g["dry_run"]: try: -# parent_group.createDimension(ncdim, size) +# parent_group.createDimension(ncdim, size) TODOZARR self._createDimension(parent_group, ncdim, size) except RuntimeError: pass # TODO convert to 'raise' via fixes upstream @@ -485,12 +488,11 @@ def _createDimension(self, group, ncdim, size): """TODOZARR """ - match self.write_vars['backend']: - case 'netCDF4': + match self.write_vars["backend"]: + case "netCDF4": group.createDimension(ncdim, size) - case 'zarr': - # Dimensions to not need to be created in Zarr - # datasets + case "zarr": + # Dimensions are not created in Zarr datasets pass def _dataset_dimensions(self, field, key, construct): @@ -625,10 +627,6 @@ def _write_dimension( """ g = self.write_vars - if g['backend'] == 'zarr': - # Dimensions don't get written to Zarr datasets - return - if axis is not None: domain_axis = self.implementation.get_domain_axes(f)[axis] logger.info( @@ -649,7 +647,8 @@ def _write_dimension( # its 
name with its basename (CF>=1.8) ncdim = self._remove_group_structure(ncdim) - if not g["dry_run"]: + # Dimensions don't get written to Zarr datasets + if not (g["dry_run"] or g['backend'] == 'zarr'): if unlimited: # Create an unlimited dimension size = None @@ -1372,7 +1371,8 @@ def _write_bounds( if not g["dry_run"]: try: - parent_group.createDimension(base_bounds_ncdim, size) + self._createDimension(parent_group, base_bounds_ncdim, size) +# parent_group.createDimension(base_bounds_ncdim, size) except RuntimeError: raise @@ -1557,7 +1557,8 @@ def _write_node_coordinates( ncdim = self._remove_group_structure(ncdim) if not g["dry_run"]: - parent_group.createDimension(ncdim, size) + # parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) # Set an appropriate default node coordinates dataset # variable name @@ -1982,7 +1983,8 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim = self._remove_group_structure(ncdim) if not g["dry_run"]: - parent_group.createDimension(ncdim, size) + # parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) ncvar = self._name(ncvar) @@ -2069,7 +2071,8 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim = self._remove_group_structure(ncdim) if not g["dry_run"]: - parent_group.createDimension(ncdim, size) + # parent_group.createDimension(ncdim, size) + self._createDimension(parent_group, ncdim, size) ncvar = self._name(ncvar) @@ -2545,25 +2548,52 @@ def _createVariable(self, **kwargs): g = self.write_vars ncvar = kwargs["varname"] - match g["backend"] + match g["backend"]: case "netCDF4": netcdf4_kwargs = kwargs # Remove Zarr-specific kwargs netcdf4_kwargs.pop('shape', None) - netcdf4_kwargs.pop('shards', None) + netcdf4_kwargs.pop('shards', None) + + if kwargs["contiguous"]: + if g["dataset"].data_model.startswith("NETCDF4"): + # NETCDF4 contiguous variables can't be compressed + kwargs["compression"] = None + kwargs["complevel"] = 0 + + # NETCDF4 contiguous variables can't span unlimited + # dimensions + unlimited_dimensions = ( + g["unlimited_dimensions"].intersection( + kwargs.get("dimensions", ()) + ) + ) + if unlimited_dimensions: + data_model = g["dataset"].data_model + raise ValueError( + f"Can't create variable {ncvar!r} in " + f"{data_model} dataset from {cfvar!r}: " + f"In {data_model} it is not allowed to write " + "contiguous (as opposed to chunked) data " + "that spans one or more unlimited dimensions: " + f"{unlimited_dimensions}" + ) + variable = g["dataset"].createVariable(**netcdf4_kwargs) case "zarr": + print ('kwargs = ', kwargs) zarr_kwargs = {"name": ncvar, "shape": kwargs.get('shape', ()), "dtype": kwargs['datatype'], "chunks": kwargs.get('chunks', 'auto'), "shards": kwargs.get('shards'), - "compressors": ???, + "compressors": None, # TODOZARR "fill_value": kwargs.get("fill_value"), - "dimension_names": kwargs["dimensions"], + "dimension_names": kwargs.get('dimensions', ()), "overwrite": True, } + print ('zarr_kwargs = ', zarr_kwargs) variable = g["dataset"].create_array(**zarr_kwargs) g["nc"][ncvar] = variable @@ -2781,7 +2811,6 @@ def _write_netcdf_variable( data, ncdimensions = self._transform_strings( data, ncdimensions, - # cfvar, data, ncdimensions ) # Whether or not to write the data @@ -2842,6 +2871,8 @@ def _write_netcdf_variable( self._remove_group_structure(ncdim) for ncdim in ncdimensions ] + # Get shape of arra + # ------------------------------------------------------------ # Create a new dataset variable # 
------------------------------------------------------------ @@ -2857,6 +2888,16 @@ def _write_netcdf_variable( "chunk_cache": g["chunk_cache"], } + if data is not None: + compressed=self._compressed_data(ncdimensions) + if compressed: + # Write data in its compressed form + shape = data.source().source().shape + else: + shape = data.shape + + kwargs['shape'] = shape + # ------------------------------------------------------------ # Create a quantization container variable, add any extra # quantization attributes, and if required instruct @@ -3003,9 +3044,12 @@ def _write_netcdf_variable( # dimensions and dataset chunking strategy will # otherwise reflect the aggregated data in memory, # rather than the scalar variable in the file. - kwargs["dimensions"] = () - kwargs["contiguous"] = True - kwargs["chunksizes"] = None + kwargs["shape"] = () # zarr + kwargs["chunks"] = 'auto' # zarr + kwargs["shards"] = None # zarr + kwargs["dimensions"] = () # netCDF4 + zarr + kwargs["contiguous"] = True # netCDF4 + kwargs["chunksizes"] = None # netCDF4 # Add compression parameters (but not for scalars or vlen # strings). @@ -3030,26 +3074,27 @@ def _write_netcdf_variable( ) # pragma: no cover # Adjust createVariable arguments for contiguous variables - if kwargs["contiguous"]: - if g["dataset"].data_model.startswith("NETCDF4"): - # NETCDF4 contiguous variables can't span unlimited - # dimensions - unlimited_dimensions = g["unlimited_dimensions"].intersection( - kwargs["dimensions"] - ) - if unlimited_dimensions: - data_model = g["dataset"].data_model - raise ValueError( - f"Can't create variable {ncvar!r} in {data_model} " - f"file from {cfvar!r}: In {data_model} it is not " - "allowed to write contiguous (as opposed to chunked) " - "data that spans one or more unlimited dimensions: " - f"{unlimited_dimensions}" - ) - - # NETCDF4 contiguous variables can't be compressed - kwargs["compression"] = None - kwargs["complevel"] = 0 + # TODOZARR - moved to `_createVariable` + # if kwargs["contiguous"]: + # if g["dataset"].data_model.startswith("NETCDF4"): + # # NETCDF4 contiguous variables can't span unlimited + # # dimensions + # unlimited_dimensions = g["unlimited_dimensions"].intersection( + # kwargs["dimensions"] + # ) + # if unlimited_dimensions: + # data_model = g["dataset"].data_model + # raise ValueError( + # f"Can't create variable {ncvar!r} in {data_model} " + # f"file from {cfvar!r}: In {data_model} it is not " + # "allowed to write contiguous (as opposed to chunked) " + # "data that spans one or more unlimited dimensions: " + # f"{unlimited_dimensions}" + # ) + # + # # NETCDF4 contiguous variables can't be compressed + # kwargs["compression"] = None + # kwargs["complevel"] = 0 try: self._createVariable(**kwargs) @@ -3181,8 +3226,6 @@ def _transform_strings(self, data, ncdimensions): # TODOZARR - consider always writing string arrays in zarr (rather than char arrays) datatype = self._datatype(data) - - if data is not None and datatype == "S1": # -------------------------------------------------------- @@ -3195,6 +3238,7 @@ def _transform_strings(self, data, ncdimensions): array = self._numpy_compressed(array) strlen = len(max(array, key=len)) + del array data = self._convert_to_char(data) ncdim = self._string_length_dimension(strlen) @@ -4689,7 +4733,7 @@ def _write_global_attributes(self, fields): delimiter = "," if not g["dry_run"] and not g["post_dry_run"]: - attrs = {"Conventions", delimiter.join(g["Conventions"])} + attrs = {"Conventions": delimiter.join(g["Conventions"])} # 
g["dataset"].setncattr( # "Conventions", delimiter.join(g["Conventions"]) # ) @@ -4795,7 +4839,7 @@ def dataset_open(self, filename, mode, fmt, fields): except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") - case 'zarr' + case 'zarr': nc = zarr.group( filename, overwrite=g["overwrite"], zarr_format=3) @@ -5470,10 +5514,10 @@ def _file_io_iteration( if group: # Can't write groups to a netCDF-3 file g["group"] = False - elif fmt not in NETCDF4_FMTS: + elif fmt not in NETCDF4_FMTS + ZARR_FMTS: raise ValueError( f"Unknown output file format: {fmt!r}. " - f"Valid formats are {NETCDF4_FMTS + NETCDF3_FMTS}" + f"Valid formats are {NETCDF4_FMTS + NETCDF3_FMTS + ZARR_FMTS}" ) # ------------------------------------------------------------ @@ -5539,7 +5583,7 @@ def _file_io_iteration( g["least_significant_digit"] = least_significant_digit g["fmt"] = fmt - if fmt == "ZARR": + if fmt == "ZARR3": g['backend'] = 'zarr' else: g['backend'] = 'netCDF4' diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 2cab8d468..4893dc9bf 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -421,13 +421,13 @@ class write(ReadWrite): string: `bool`, optional By default string-valued construct data are written as netCDF arrays of type string if the output file format is - ``'NETCDF4'``, or of type char with an extra dimension - denoting the maximum string length for any other output - file format (see the *fmt* parameter). If *string* is False - then string-valued construct data are written as netCDF - arrays of type char with an extra dimension denoting the - maximum string length, regardless of the selected output - file format. + ``'NETCDF4'`` or ``'ZARR3'``, or of type char with an + extra dimension denoting the maximum string length for any + other output file format (see the *fmt* parameter). If + *string* is False then string-valued construct data are + written as netCDF arrays of type char with an extra + dimension denoting the maximum string length, regardless + of the selected output file format. .. 
versionadded:: (cfdm) 1.8.0 From d009cc6dbf0f4a1d0c47a3199e358828588cc68d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 24 Aug 2025 10:59:59 +0100 Subject: [PATCH 04/39] dev --- cfdm/read_write/abstract/abstractio.py | 8 +- cfdm/read_write/netcdf/netcdfread.py | 118 +++++---- cfdm/read_write/netcdf/netcdfwrite.py | 327 ++++++++++++------------- cfdm/read_write/write.py | 5 +- 4 files changed, 235 insertions(+), 223 deletions(-) diff --git a/cfdm/read_write/abstract/abstractio.py b/cfdm/read_write/abstract/abstractio.py index c16c5f244..ccfa6a433 100644 --- a/cfdm/read_write/abstract/abstractio.py +++ b/cfdm/read_write/abstract/abstractio.py @@ -17,12 +17,12 @@ def __init__(self, implementation): @abc.abstractmethod def dataset_close(self, *args, **kwargs): - """Close the dataset file.""" + """Close the dataset.""" raise NotImplementedError() # pragma: no cover @abc.abstractmethod def dataset_open(self, *args, **kwargs): - """Open the dataset file.""" + """Open the dataset.""" raise NotImplementedError() # pragma: no cover @@ -31,7 +31,7 @@ class IORead(IO, metaclass=abc.ABCMeta): @abc.abstractmethod def read(self, *args, **kwargs): - """Read fields from a netCDF file.""" + """Read fields from a netCDF dataset.""" raise NotImplementedError() # pragma: no cover @@ -40,5 +40,5 @@ class IOWrite(IO, metaclass=abc.ABCMeta): @abc.abstractmethod def write(self, *args, **kwargs): - """Write fields to a netCDF file.""" + """Write fields to a netCDF dataset.""" raise NotImplementedError() # pragma: no cover diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index d277be493..685687e3d 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -144,7 +144,7 @@ class NetCDFRead(IORead): "is not referenced in file": 9, "exists in the file": 10, "does not exist in file": 11, - "exists in multiple external files": 12, + "exists in multiple external datasets": 12, "has incorrect size": 13, "is missing": 14, "is not used by data variable": 15, @@ -451,11 +451,11 @@ def _reference(self, ncvar, referencing_ncvar): return count - def file_close(self): - """Close all netCDF files that have been opened. + def dataset_close(self): + """Close all netCDF datasets that have been opened. - Includes the input file being read, any external files, and any - temporary flattened files. + Includes the input dataset being read, any external datasets, + and any temporary flattened dataset. :Returns: @@ -463,16 +463,16 @@ def file_close(self): **Examples** - >>> r.file_close() + >>> r.dataset_close() """ g = self.read_vars - # Close temporary flattened files - for flat_file in g["flat_files"]: - flat_file.close() + # Close temporary flattened datasets + for flat_dataset in g["flat_datasets"]: + flat_dataset.close() - if g["file_opened_with"] == "zarr": + if g["dataset_opened_with"] == "zarr": # zarr return @@ -488,8 +488,8 @@ def file_close(self): for f in g["s3fs_File_objects"]: f.close() - def file_open(self, dataset, flatten=True, verbose=None): - """Open the netCDF file for reading. + def dataset_open(self, dataset, flatten=True, verbose=None): + """Open the netCDF dataset for reading. If the file has hierarchical groups then a flattened version of it is returned, and the original grouped file remains open. 
@@ -513,7 +513,7 @@ def file_open(self, dataset, flatten=True, verbose=None): **Examples** - >>> r.file_open('file.nc') + >>> r.dataset_open('file.nc') """ g = self.read_vars @@ -561,7 +561,7 @@ def file_open(self, dataset, flatten=True, verbose=None): ) # pragma: no cover # Map backend names to file-open functions - file_open_function = { + dataset_open_function = { "h5netcdf": self._open_h5netcdf, "netCDF4": self._open_netCDF4, "zarr": self._open_zarr, @@ -573,7 +573,7 @@ def file_open(self, dataset, flatten=True, verbose=None): errors = [] for backend in netcdf_backend: try: - nc = file_open_function[backend](dataset) + nc = dataset_open_function[backend](dataset) except KeyError: errors.append(f"{backend}: Unknown netCDF backend name") except Exception as error: @@ -600,7 +600,7 @@ def file_open(self, dataset, flatten=True, verbose=None): if flatten and self._dataset_has_groups(nc): # Create a diskless, non-persistent container for the # flattened file - flat_file = tempfile.NamedTemporaryFile( + flat_dataset = tempfile.NamedTemporaryFile( mode="wb", dir=tempfile.gettempdir(), prefix="cfdm_flat_", @@ -609,7 +609,7 @@ def file_open(self, dataset, flatten=True, verbose=None): ) flat_nc = netCDF4.Dataset( - flat_file, "w", diskless=True, persist=False + flat_dataset, "w", diskless=True, persist=False ) flat_nc.set_fill_off() @@ -625,10 +625,10 @@ def file_open(self, dataset, flatten=True, verbose=None): nc = flat_nc g["has_groups"] = True - g["flat_files"].append(flat_file) + g["flat_datasets"].append(flat_dataset) g["nc_opened_with"] = "netCDF4" else: - g["nc_opened_with"] = g["file_opened_with"] + g["nc_opened_with"] = g["dataset_opened_with"] g["nc"] = nc return nc @@ -649,7 +649,7 @@ def _open_netCDF4(self, filename): """ nc = netCDF4.Dataset(filename, "r") - self.read_vars["file_opened_with"] = "netCDF4" + self.read_vars["dataset_opened_with"] = "netCDF4" return nc def _open_h5netcdf(self, filename): @@ -683,7 +683,7 @@ def _open_h5netcdf(self, filename): rdcc_w0=0.75, rdcc_nslots=4133, ) - self.read_vars["file_opened_with"] = "h5netcdf" + self.read_vars["dataset_opened_with"] = "h5netcdf" return nc def _open_zarr(self, dataset): @@ -708,7 +708,7 @@ def _open_zarr(self, dataset): raise nc = zarr.open(dataset) - self.read_vars["file_opened_with"] = "zarr" + self.read_vars["dataset_opened_with"] = "zarr" return nc def cdl_to_netcdf(self, filename): @@ -1009,7 +1009,7 @@ def read( dask_chunks: `str`, `int`, `None`, or `dict`, optional Specify the `dask` chunking of dimensions for data in - the input files. See `cfdm.read` for details. + the input datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.11.2.0 @@ -1020,14 +1020,14 @@ def read( .. versionadded:: (cfdm) 1.12.0.0 cfa: `dict`, optional - Configure the reading of CF-netCDF aggregation files. - See `cfdm.read` for details. + Configure the reading of CF-netCDF aggregation + datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 cfa_write: sequence of `str`, optional - Configure the reading of CF-netCDF aggregation files. - See `cfdm.read` for details. + Configure the reading of CF-netCDF aggregation + datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 @@ -1051,8 +1051,8 @@ def read( .. versionadded:: (cfdm) 1.12.0.0 dataset_type: `None` or (sequence of) `str`, optional - Only read files of the given type(s). See `cfdm.read` - for details. + Only read datasets of the given type(s). See + `cfdm.read` for details. .. 
versionadded:: (cfdm) 1.12.0.0 @@ -1122,7 +1122,7 @@ def read( # valid. # # Note that the `dataset_type` method is much faster than the - # `file_open` method at returning for unrecognised types. + # `dataset_open` method at returning for unrecognised types. # ------------------------------------------------------------ d_type = self.dataset_type(dataset, dataset_type) if not d_type: @@ -1373,8 +1373,8 @@ def read( # Assume a priori that the dataset does not have a group # structure "has_groups": False, - # Keep a list of flattened file names - "flat_files": [], + # Keep a list of flattened dataset names + "flat_datasets": [], # -------------------------------------------------------- # Domains (CF>=1.9) # -------------------------------------------------------- @@ -1471,10 +1471,10 @@ def read( g.update(deepcopy(extra_read_vars)) # ------------------------------------------------------------ - # Open the netCDF file to be read + # Open the netCDF dataset to be read # ------------------------------------------------------------ try: - nc = self.file_open(dataset, flatten=True, verbose=None) + nc = self.dataset_open(dataset, flatten=True, verbose=None) except DatasetTypeError: if not g["ignore_unknown_type"]: raise @@ -1573,7 +1573,7 @@ def read( dimension_isunlimited = {} # ------------------------------------------------------------ - # For grouped files (CF>=1.8) map: + # For grouped datasets (CF>=1.8) map: # # * each flattened variable name to its absolute path # * each flattened dimension name to its absolute path @@ -1799,10 +1799,10 @@ def read( g["variables"] = variables # The netCDF4 dataset objects that have been opened (i.e. the - # for parent file and any external files) + # for parent file and any external datasets) g["datasets"] = [nc] - # The names of the variable in the parent files + # The names of the variable in the parent datasetss # (i.e. excluding any external variables) g["internal_variables"] = set(variables) @@ -2148,7 +2148,7 @@ def read( if g["CF>=1.7"]: logger.info( f" External variables: {g['external_variables']}\n" - f" External files : {g['external_files']}" + f" External datasets : {g['external_files']}" ) # pragma: no cover if g["external_files"] and g["external_variables"]: @@ -2334,9 +2334,9 @@ def read( self._check_valid(f, c) # ------------------------------------------------------------ - # Close all opened netCDF files + # Close all opened netCDF datasets # ------------------------------------------------------------ - self.file_close() + self.dataset_close() # ------------------------------------------------------------ # Squeeze/unsqueeze size 1 axes in field constructs @@ -2539,7 +2539,7 @@ def _customise_read_vars(self): pass def _get_variables_from_external_files(self, netcdf_external_variables): - """Get external variables from external files. + """Get external variables from external datasets. 
..versionadded:: (cfdm) 1.7.0 @@ -2585,7 +2585,7 @@ def _get_variables_from_external_files(self, netcdf_external_variables): for external_file in external_files: logger.info( - "\nScanning external file:\n-----------------------" + "\nScanning external datasets:\n---------------------------" ) # pragma: no cover # Note: We pass in the s3 file system (if any) of the @@ -2625,7 +2625,7 @@ def _get_variables_from_external_files(self, netcdf_external_variables): ncvar, message=( "External variable", - "exists in multiple external files", + "exists in multiple external datasets", ), attribute=attribute, ) @@ -6705,13 +6705,25 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - file_opened_with = g["file_opened_with"] - if file_opened_with == "netCDF4": - array = self.implementation.initialise_NetCDF4Array(**kwargs) - elif file_opened_with == "h5netcdf": - array = self.implementation.initialise_H5netcdfArray(**kwargs) - elif file_opened_with == "zarr": - array = self.implementation.initialise_ZarrArray(**kwargs) + # file_opened_with = g["file_opened_with"] + # if file_opened_with == "netCDF4": + # array = self.implementation.initialise_NetCDF4Array(**kwargs) + # elif file_opened_with == "h5netcdf": + # array = self.implementation.initialise_H5netcdfArray(**kwargs) + # elif file_opened_with == "zarr": + # array = self.implementation.initialise_ZarrArray(**kwargs) + + match g["dataset_opened_with"]: + case "netCDF4": + array = self.implementation.initialise_NetCDF4Array( + **kwargs + ) + case "h5netcdf": + array = self.implementation.initialise_H5netcdfArray( + **kwargs + ) + case "zarr": + array = self.implementation.initialise_ZarrArray(**kwargs) return array, kwargs @@ -10862,7 +10874,7 @@ def _dataset_has_groups(self, nc): `bool` """ - if self.read_vars["file_opened_with"] == "zarr": + if self.read_vars["dataset_opened_with"] == "zarr": # zarr if len(tuple(nc.groups())) > 1: raise ReadError( @@ -11093,7 +11105,7 @@ def _file_variable_attributes(self, var): # netCDF4 return {attr: var.getncattr(attr) for attr in var.ncattrs()} else: - if self.read_vars["file_opened_with"] == "zarr": + if self.read_vars["dataset_opened_with"] == "zarr": # zarr: Remove the _ARRAY_DIMENSIONS attribute attrs.pop("_ARRAY_DIMENSIONS", None) @@ -11149,7 +11161,7 @@ def _file_variable_size(self, var): """ # Use try/except here because the variable type could differ # from that implied by the value of - # read_vars["file_opened_with"] + # read_vars["dataset_opened_with"] try: # netCDF4, zarr return var.size diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 0e1b207c9..8a3c97757 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -7,7 +7,6 @@ import netCDF4 import numpy as np import zarr - from dask import config as dask_config from dask.array.core import normalize_chunks from dask.utils import parse_bytes @@ -24,7 +23,8 @@ NETCDF3_FMTS, NETCDF4_FMTS, NETCDF_QUANTIZATION_PARAMETERS, - NETCDF_QUANTIZE_MODES, ZARR_FMTS + NETCDF_QUANTIZE_MODES, + ZARR_FMTS, ) from .netcdfread import NetCDFRead @@ -42,10 +42,9 @@ class AggregationError(Exception): class NetCDFWrite(IOWrite): - """A container for writing Fields to a dataset. + """A container for writing Fields to a netCDF dataset. - Both netCDF and Zarr output formats are supported (despite the - name of the class!). + NetCDF3, netCDF4 and Zarr output formats are supported. 
""" @@ -88,7 +87,8 @@ def _createGroup(self, parent, group_name): :Parameters: parent: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` - + The group in which to create the new group. + group_name: `str` The name of the group. @@ -98,12 +98,12 @@ def _createGroup(self, parent, group_name): The new group object. """ - backend = self.write_vars['backend'] - if backend == 'netCDF4': - return parent.createGroup(group_name) - - if backend == 'zarr': - return parent.create_group(group_name) + match self.write_vars["backend"]: + case "netCDF4": + return parent.createGroup(group_name) + + case "zarr": + return parent.create_group(group_name) def _create_variable_name(self, parent, default): """Create an appropriate name for a dataset variable. @@ -300,7 +300,7 @@ def _write_variable_attributes(self, parent, ncvar, extra=None, omit=()): if not g["dry_run"]: # TODOZARR self._set_attributes(netcdf_attrs, ncvar) - + if skip_set_fill_value: # Re-add as known attribute since this FV is already set netcdf_attrs["_FillValue"] = self.implementation.get_data( @@ -310,14 +310,14 @@ def _write_variable_attributes(self, parent, ncvar, extra=None, omit=()): return netcdf_attrs def _set_attributes(self, attributes, ncvar=None, group=None): - """TODOZARR""" + """TODOZARR.""" g = self.write_vars if ncvar is not None: # Set variable attributes x = g["nc"][ncvar] elif group is not None: # Set group-level attributes - x = group + x = group else: raise ValueError("Must set ncvar or group") @@ -325,9 +325,9 @@ def _set_attributes(self, attributes, ncvar=None, group=None): case "netCDF": x.setncatts(attributes) case "zarr": - print('ATTR=', attributes) - x.update_attributes(attributes) - + print("ATTR=", attributes) + x.update_attributes(attributes) + def _character_array(self, array): """Converts a numpy array of strings to character data type. @@ -446,7 +446,8 @@ def _datatype(self, variable): return f"{dtype.kind}{dtype.itemsize}" def _string_length_dimension(self, size): - """Creates a dataset dimension for string variables if necessary. + """Creates a dataset dimension for string variables if + necessary. :Parameters: @@ -463,9 +464,7 @@ def _string_length_dimension(self, size): # ------------------------------------------------------------ # Create a new dimension for the maximum string length # ------------------------------------------------------------ - ncdim = self._name( - f"strlen{size}", dimsize=size, role="string_length" - ) + ncdim = self._name(f"strlen{size}", dimsize=size, role="string_length") if ncdim not in g["ncdim_to_size"]: # This string length dimension needs creating @@ -477,7 +476,7 @@ def _string_length_dimension(self, size): if not g["dry_run"]: try: -# parent_group.createDimension(ncdim, size) TODOZARR + # parent_group.createDimension(ncdim, size) TODOZARR self._createDimension(parent_group, ncdim, size) except RuntimeError: pass # TODO convert to 'raise' via fixes upstream @@ -485,16 +484,14 @@ def _string_length_dimension(self, size): return ncdim def _createDimension(self, group, ncdim, size): - """TODOZARR - - """ + """TODOZARR.""" match self.write_vars["backend"]: case "netCDF4": group.createDimension(ncdim, size) case "zarr": # Dimensions are not created in Zarr datasets pass - + def _dataset_dimensions(self, field, key, construct): """Returns the dataset dimension names for the construct. 
@@ -647,8 +644,8 @@ def _write_dimension( # its name with its basename (CF>=1.8) ncdim = self._remove_group_structure(ncdim) - # Dimensions don't get written to Zarr datasets - if not (g["dry_run"] or g['backend'] == 'zarr'): + # Dimensions are not created in Zarr datasets + if not g["dry_run"] and g["backend"] != "zarr": if unlimited: # Create an unlimited dimension size = None @@ -664,9 +661,9 @@ def _write_dimension( if error == "NetCDF: NC_UNLIMITED size already in use": raise RuntimeError( message - + f" In a {g['netcdf'].data_model} file only one " - "unlimited dimension is allowed. Consider using " - "a netCDF4 format." + + f" In a {g['netcdf'].data_model} dataset only " + "one unlimited dimension is allowed. Consider " + "using a netCDF4 format." ) raise RuntimeError(message) @@ -758,13 +755,11 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): # No dataset variable name not correponding to a # dataset dimension name has been set, so create a # default dataset variable name. - ncvar = self._create_variable_name( - coord, default="coordinate" - ) + ncvar = self._create_variable_name(coord, default="coordinate") ncdim = ncvar - # Create a new dataset dimension (null-op for Zarr) + # Create a new dataset dimension unlimited = self._unlimited(f, axis) self._write_dimension(ncdim, f, axis, unlimited=unlimited) @@ -806,9 +801,7 @@ def _write_count_variable( g = self.write_vars if not self._already_in_file(count_variable): - ncvar = self._create_variable_name( - count_variable, default="count" - ) + ncvar = self._create_variable_name(count_variable, default="count") if create_ncdim: ncdim = self._name(ncdim) @@ -883,9 +876,7 @@ def _write_index_variable( g = self.write_vars if not self._already_in_file(index_variable): - ncvar = self._create_variable_name( - index_variable, default="index" - ) + ncvar = self._create_variable_name(index_variable, default="index") if create_ncdim: ncdim = self._name(ncdim) @@ -915,9 +906,7 @@ def _write_list_variable(self, f, list_variable, compress): create = not self._already_in_file(list_variable) if create: - ncvar = self._create_variable_name( - list_variable, default="list" - ) + ncvar = self._create_variable_name(list_variable, default="list") # Create a new dimension self._write_dimension( @@ -1234,7 +1223,7 @@ def _write_geometry_container(self, field, geometry_container): kwargs = { "varname": ncvar, "datatype": "S1", - "dimensions": (), +# "dimensions": (), # TODOZARR "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) @@ -1243,7 +1232,7 @@ def _write_geometry_container(self, field, geometry_container): self._createVariable(**kwargs) # TODOZARR - #g["nc"][ncvar].setncatts(geometry_container) + # g["nc"][ncvar].setncatts(geometry_container) self._set_attributes(ncvar, geometry_container) # Update the 'geometry_containers' dictionary @@ -1333,9 +1322,7 @@ def _write_bounds( # structure from the name. bounds_ncdim = self._remove_group_structure(bounds_ncdim) - bounds_ncdim = self._name( - bounds_ncdim, dimsize=size, role="bounds" - ) + bounds_ncdim = self._name(bounds_ncdim, dimsize=size, role="bounds") # Check if this bounds variable has not been previously # created. 
@@ -1371,8 +1358,10 @@ def _write_bounds( if not g["dry_run"]: try: - self._createDimension(parent_group, base_bounds_ncdim, size) -# parent_group.createDimension(base_bounds_ncdim, size) + self._createDimension( + parent_group, base_bounds_ncdim, size + ) + # parent_group.createDimension(base_bounds_ncdim, size) except RuntimeError: raise @@ -2132,9 +2121,7 @@ def _write_scalar_coordinate( scalar_coord = self.implementation.squeeze(coord_1d, axes=0) if not self._already_in_file(scalar_coord, ()): - ncvar = self._create_variable_name( - scalar_coord, default="scalar" - ) + ncvar = self._create_variable_name(scalar_coord, default="scalar") # If this scalar coordinate has bounds then create the # bounds dataset variable and add the 'bounds' or # 'climatology' (as appropriate) attribute to the @@ -2229,9 +2216,7 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): f, coord, key, ncdimensions, coord_ncvar=None ) else: - ncvar = self._create_variable_name( - coord, default="auxiliary" - ) + ncvar = self._create_variable_name(coord, default="auxiliary") # TODO: move setting of bounds ncvar to here - why? @@ -2380,9 +2365,7 @@ def _write_field_ancillary( if not create: ncvar = g["seen"][id(anc)]["ncvar"] else: - ncvar = self._create_variable_name( - anc, default="ancillary_data" - ) + ncvar = self._create_variable_name(anc, default="ancillary_data") # Create a new field ancillary variable self._write_netcdf_variable( @@ -2492,12 +2475,17 @@ def _set_external_variables(self, ncvar): external_variables.add(ncvar) if not g["dry_run"] and not g["post_dry_run"]: self._set_attributes( - {"external_variables": " ".join(sorted(external_variables))} - ) -# g["dataset"].setncattr( -# "external_variables", " ".join(sorted(external_variables)) -# ) - + { + "external_variables": " ".join( + sorted(external_variables) + ) + } + ) + + # g["dataset"].setncattr( + # "external_variables", " ".join(sorted(external_variables)) + # ) + def _create_external( self, field=None, construct_id=None, ncvar=None, ncdimensions=None ): @@ -2545,59 +2533,62 @@ def _createVariable(self, **kwargs): .. 
versionadded:: (cfdm) 1.7.0 """ - g = self.write_vars + g = self.write_vars ncvar = kwargs["varname"] match g["backend"]: case "netCDF4": netcdf4_kwargs = kwargs - # Remove Zarr-specific kwargs - netcdf4_kwargs.pop('shape', None) - netcdf4_kwargs.pop('shards', None) - - if kwargs["contiguous"]: - if g["dataset"].data_model.startswith("NETCDF4"): - # NETCDF4 contiguous variables can't be compressed - kwargs["compression"] = None - kwargs["complevel"] = 0 - - # NETCDF4 contiguous variables can't span unlimited - # dimensions - unlimited_dimensions = ( - g["unlimited_dimensions"].intersection( - kwargs.get("dimensions", ()) - ) + if "dimensions" not in kwargs: + netcdf4_kwargs["dimensions"] = () + + if kwargs["contiguous"] and g["dataset"].data_model.startswith( + "NETCDF4" + ): + # NETCDF4 contiguous variables can't be compressed + kwargs["compression"] = None + kwargs["complevel"] = 0 + + # NETCDF4 contiguous variables can't span unlimited + # dimensions + unlimited_dimensions = g[ + "unlimited_dimensions" + ].intersection(kwargs.get("dimensions", ())) + if unlimited_dimensions: + data_model = g["dataset"].data_model + raise ValueError( + f"Can't create variable {ncvar!r} in " + f"{data_model} dataset: " + f"In {data_model} it is not allowed to write " + "contiguous (as opposed to chunked) data " + "that spans one or more unlimited dimensions: " + f"{unlimited_dimensions}" ) - if unlimited_dimensions: - data_model = g["dataset"].data_model - raise ValueError( - f"Can't create variable {ncvar!r} in " - f"{data_model} dataset from {cfvar!r}: " - f"In {data_model} it is not allowed to write " - "contiguous (as opposed to chunked) data " - "that spans one or more unlimited dimensions: " - f"{unlimited_dimensions}" - ) - + + # Remove Zarr-specific kwargs + netcdf4_kwargs.pop("shape", None) + netcdf4_kwargs.pop("shards", None) + variable = g["dataset"].createVariable(**netcdf4_kwargs) - case "zarr": - print ('kwargs = ', kwargs) - zarr_kwargs = {"name": ncvar, - "shape": kwargs.get('shape', ()), - "dtype": kwargs['datatype'], - "chunks": kwargs.get('chunks', 'auto'), - "shards": kwargs.get('shards'), - "compressors": None, # TODOZARR - "fill_value": kwargs.get("fill_value"), - "dimension_names": kwargs.get('dimensions', ()), - "overwrite": True, - } - print ('zarr_kwargs = ', zarr_kwargs) + case "zarr": + print("kwargs = ", kwargs) + zarr_kwargs = { + "name": ncvar, + "dtype": kwargs["datatype"], + "shape": kwargs.get("shape", ()), + "dimension_names": kwargs.get("dimensions", ()), + "chunks": kwargs.get("chunks", "auto"), + "shards": kwargs.get("shards"), + "compressors": None, # TODOZARR + "fill_value": kwargs.get("fill_value"), + "overwrite": True, + } + print("zarr_kwargs = ", zarr_kwargs) variable = g["dataset"].create_array(**zarr_kwargs) g["nc"][ncvar] = variable - + def _write_grid_mapping(self, f, ref, multiple_grid_mappings): """Write a grid mapping georeference to the dataset. 
@@ -2638,7 +2629,7 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): kwargs = { "varname": ncvar, "datatype": "S1", - "dimensions": (), +# "dimensions": (), # TODOZARR "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) @@ -2674,7 +2665,7 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): if not g["dry_run"]: # TODOZARR # g["nc"][ncvar].setncatts(parameters) - self._set_attributes(parameters, ncvar ) + self._set_attributes(parameters, ncvar) # Update the 'seen' dictionary g["seen"][id(ref)] = { @@ -2842,7 +2833,7 @@ def _write_netcdf_variable( contiguous, chunksizes = self._chunking_parameters( data, ncdimensions ) - + logger.debug( f" chunksizes: {chunksizes}\n" f" contiguous: {contiguous}" @@ -2872,7 +2863,7 @@ def _write_netcdf_variable( ] # Get shape of arra - + # ------------------------------------------------------------ # Create a new dataset variable # ------------------------------------------------------------ @@ -2889,15 +2880,15 @@ def _write_netcdf_variable( } if data is not None: - compressed=self._compressed_data(ncdimensions) + compressed = self._compressed_data(ncdimensions) if compressed: # Write data in its compressed form shape = data.source().source().shape else: shape = data.shape - kwargs['shape'] = shape - + kwargs["shape"] = shape + # ------------------------------------------------------------ # Create a quantization container variable, add any extra # quantization attributes, and if required instruct @@ -2938,7 +2929,7 @@ def _write_netcdf_variable( f"Can't yet quantize on write {cfvar!r} to a Zarr " "dataset TODOZARR" ) - + # Set "implemention" to this version of the netCDF-C # library self.implementation.set_parameter( @@ -3045,7 +3036,7 @@ def _write_netcdf_variable( # otherwise reflect the aggregated data in memory, # rather than the scalar variable in the file. 
kwargs["shape"] = () # zarr - kwargs["chunks"] = 'auto' # zarr + kwargs["chunks"] = "auto" # zarr kwargs["shards"] = None # zarr kwargs["dimensions"] = () # netCDF4 + zarr kwargs["contiguous"] = True # netCDF4 @@ -3226,7 +3217,7 @@ def _transform_strings(self, data, ncdimensions): # TODOZARR - consider always writing string arrays in zarr (rather than char arrays) datatype = self._datatype(data) - + if data is not None and datatype == "S1": # -------------------------------------------------------- # Convert a string data type numpy array into a character @@ -3349,7 +3340,7 @@ def _write_data( ) # TODOZARR - print (type(g["nc"][ncvar])) + print(type(g["nc"][ncvar])) da.store(dx, g["nc"][ncvar], compute=True, return_stored=False) def _check_valid(self, array, cfvar=None, attributes=None): @@ -4195,11 +4186,11 @@ def _write_field_or_domain( # "formula_terms", formula_terms # ) self._set_attributes( - {"formula_terms": formula_terms}, ncvar + {"formula_terms": formula_terms}, ncvar ) except KeyError: pass # TODO convert to 'raise' via fixes upstream - + logger.info( " Writing formula_terms attribute to variable " f"{ncvar}: {formula_terms!r}" @@ -4213,12 +4204,12 @@ def _write_field_or_domain( if not g["dry_run"] and not g["post_dry_run"]: try: # TODOZARR -# g["nc"][bounds_ncvar].setncattr( -# "formula_terms", bounds_formula_terms -# ) + # g["nc"][bounds_ncvar].setncattr( + # "formula_terms", bounds_formula_terms + # ) self._set_attributes( {"formula_terms": bounds_formula_terms}, - bounds_ncvar + bounds_ncvar, ) except KeyError: pass # TODO convert to 'raise' via fixes upstream @@ -4495,7 +4486,7 @@ def _unlimited(self, field, axis): """ return self.implementation.nc_is_unlimited_axis(field, axis) - #def _write_group(self, parent_group, group_name): + # def _write_group(self, parent_group, group_name): # """Creates a new parent group object. # # .. 
versionadded:: (cfdm) 1.8.6.0 @@ -4579,7 +4570,7 @@ def _write_group_attributes(self, fields): f0, attr ) - nc = g["dataset"] # TODOZARR + nc = g["dataset"] # TODOZARR for group in groups: if group in nc.groups: nc = nc.groups[group] @@ -4734,16 +4725,16 @@ def _write_global_attributes(self, fields): if not g["dry_run"] and not g["post_dry_run"]: attrs = {"Conventions": delimiter.join(g["Conventions"])} -# g["dataset"].setncattr( -# "Conventions", delimiter.join(g["Conventions"]) -# ) + # g["dataset"].setncattr( + # "Conventions", delimiter.join(g["Conventions"]) + # ) # ------------------------------------------------------------ # Write the file descriptors to the file # ------------------------------------------------------------ attrs.update(g["file_descriptors"]) -# for attr, value in g["file_descriptors"].items(): -# g["dataset"].setncattr(attr, value) + # for attr, value in g["file_descriptors"].items(): + # g["dataset"].setncattr(attr, value) # ------------------------------------------------------------ # Write other global attributes to the file @@ -4755,18 +4746,18 @@ def _write_global_attributes(self, fields): } ) # for attr in global_attributes - set(("Conventions",)): -# g["dataset"].setncattr( -# attr, self.implementation.get_property(f0, attr) -# ) + # g["dataset"].setncattr( + # attr, self.implementation.get_property(f0, attr) + # ) # ------------------------------------------------------------ # Write "forced" global attributes to the file # ------------------------------------------------------------ attrs.update(force_global) - + self._set_attributes(attrs, group=g["dataset"]) -# for attr, v in force_global.items(): -# g["dataset"].setncattr(attr, v) + # for attr, v in force_global.items(): + # g["dataset"].setncattr(attr, v) g["global_attributes"] = global_attributes @@ -4825,24 +4816,25 @@ def dataset_open(self, filename, mode, fmt, fields): ) g = self.write_vars - match g['backend']: - case 'netCDF4': + match g["backend"]: + case "netCDF4": # mode == 'w' is safer than != 'a' in case of a typo # (the letters are neighbours on a QWERTY keyboard) # since 'w' is destructive. Note that for append # ('a') mode the original file is never wiped. if mode == "w" and g["overwrite"]: - os.remove(filename) - + os.remove(filename) + try: nc = netCDF4.Dataset(filename, mode, format=fmt) except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") - case 'zarr': + case "zarr": nc = zarr.group( - filename, overwrite=g["overwrite"], zarr_format=3) - + filename, overwrite=g["overwrite"], zarr_format=3 + ) + return nc @_manage_log_level_via_verbosity @@ -4873,7 +4865,7 @@ def write( group=True, coordinates=False, omit_data=None, - dataset_chunks="4MiB", + dataset_chunks="4MiB", dataset_shards=None, cfa="auto", reference_datetime=None, ): @@ -5114,6 +5106,12 @@ def write( The dataset chunking strategy. The default value is "4MiB". See `cfdm.write` for details. + dataset_shards: `str`, `int`, or `float`, optional + The Zarr dataset sharding strategy. The default value + is `None`. See `cfdm.write` for details. + + .. versionadded:: (cfdm) NEXTVERSION + cfa: `dict` or `None`, optional Configure the creation of aggregation variables. See `cfdm.write` for details. 
@@ -5247,9 +5245,10 @@ def write(
             # dataset variable
             "cfa_write_status": {},
             # --------------------------------------------------------
-            # Dataset chunking stategy
+            # Dataset chunking and sharding strategy
             # --------------------------------------------------------
             "dataset_chunks": dataset_chunks,
+            "dataset_shards": dataset_shards,
             # --------------------------------------------------------
             # Quantization: Store unique Quantization objects, keyed
             # by their output dataset variable names.
@@ -5277,8 +5276,6 @@ def write(
                 f"{dataset_chunks!r}."
             )
 
-
-
         # ------------------------------------------------------------
         # Parse the 'cfa' keyword
         # ------------------------------------------------------------
@@ -5584,10 +5581,10 @@ def _file_io_iteration(
         g["fmt"] = fmt
 
         if fmt == "ZARR3":
-            g['backend'] = 'zarr'
+            g["backend"] = "zarr"
         else:
-            g['backend'] = 'netCDF4'
-            
+            g["backend"] = "netCDF4"
+
         if isinstance(
             fields,
             (
@@ -5813,10 +5810,10 @@ def _chunking_parameters(self, data, ncdimensions):
         chunksizes = self.implementation.nc_get_dataset_chunksizes(data)
         if chunksizes == "contiguous":
             # Contiguous as defined by 'data'
-            if g['zarr']:
+            if g["zarr"]:
                 # Return a single chunk
                 return False, self._shape_in_dataset(data, ncdimensions)
-            
+
             return True, None
 
         # Still here?
@@ -5832,10 +5829,10 @@ def _chunking_parameters(self, data, ncdimensions):
         #    dataset_chunks
         if dataset_chunks == "contiguous":
             # Contiguous as defined by 'dataset_chunks'
-            if g['zarr']:
+            if g["zarr"]:
                 # Return a single chunk
                 return False, self._shape_in_dataset(data, ncdimensions)
-            
+
             return True, None
 
         # Still here? Then work out the chunks from both the
@@ -5866,14 +5863,14 @@ def _chunking_parameters(self, data, ncdimensions):
         return True, None
 
     def _shape_in_dataset(self, data, ncdimensions):
-        """TODOZARR"""
+        """TODOZARR."""
         if self._compressed_data(ncdimensions):
             d = self.implementation.get_compressed_array(data)
         else:
             d = data
-        
+
         return d.shape
-    
+
     def _compressed_data(self, ncdimensions):
         """Whether or not the data is being written in compressed form.
 
@@ -6581,7 +6578,7 @@ def _write_quantization_container(self, quantization):
         kwargs = {
             "varname": ncvar,
             "datatype": "S1",
-            "dimensions": (),
+#            "dimensions": (),  # TODOZARR
             "endian": g["endian"],
         }
         kwargs.update(g["netcdf_compression"])
@@ -6591,10 +6588,10 @@ def _write_quantization_container(self, quantization):
             self._createVariable(**kwargs)
 
             # Set the attributes
-#            g["nc"][ncvar].setncatts(
-#                self.implementation.parameters(quantization)
-#            )
-            # TODOZARR
+            # g["nc"][ncvar].setncatts(
+            #     self.implementation.parameters(quantization)
+            # )
+            # TODOZARR
             self._set_attributes(
                 self.implementation.parameters(quantization), ncvar
             )
diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py
index 4893dc9bf..8595fdda0 100644
--- a/cfdm/read_write/write.py
+++ b/cfdm/read_write/write.py
@@ -611,6 +611,9 @@ class write(ReadWrite):
 
         .. versionadded:: (cfdm) 1.12.0.0
 
+    dataset_shards: `str` or `int` or `float`, optional
+        TODOZARR
+
     cfa: `str` or `dict` or `None`, optional
         Specify which netCDF variables, if any, should be written
         as CF-netCDF aggregation variables. 
@@ -849,6 +852,6 @@ def __new__( coordinates=coordinates, extra_write_vars=extra_write_vars, omit_data=omit_data, - dataset_chunks=dataset_chunks, + dataset_chunks=dataset_chunks, dataset_shards =dataset_shards, cfa=cfa, ) From 0af9c7695f7d2dee3615d1f81bb75d485334455c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 26 Aug 2025 23:25:19 +0100 Subject: [PATCH 05/39] dev --- cfdm/data/netcdfindexer.py | 4 +- cfdm/read_write/netcdf/netcdfread.py | 70 ++-- cfdm/read_write/netcdf/netcdfwrite.py | 470 +++++++++++++++----------- cfdm/read_write/write.py | 88 ++--- 4 files changed, 358 insertions(+), 274 deletions(-) diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index dd873a374..e0d07ec52 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -392,7 +392,7 @@ def _default_FillValue(self, dtype): The default ``_FillValue``. """ - if dtype.kind in "OS": + if dtype.kind in "OST": return default_fillvals["S1"] return default_fillvals[dtype.str[1:]] @@ -618,7 +618,7 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): if fvalisnan: mask = np.isnan(data) else: - mask = data == fval + mask = np.asanyarray(data == fval) if mask.any(): if fill_value is None: diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 685687e3d..9b976c2b3 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -8178,36 +8178,46 @@ def _create_Data( `Data` """ - if array.dtype is None: - g = self.read_vars - if g["has_groups"]: - group, name = self._netCDF4_group( - g["variable_grouped_dataset"][ncvar], ncvar - ) - variable = group.variables.get(name) - else: - variable = g["variables"].get(ncvar) - - array = variable[...] - - string_type = isinstance(array, str) - if string_type: - # A netCDF string type scalar variable comes out as Python - # str object, so convert it to a numpy array. - array = np.array(array, dtype=f"U{len(array)}") - - if not variable.ndim: - # NetCDF4 has a thing for making scalar size 1 - # variables into 1d arrays - array = array.squeeze() - - if not string_type: - # An N-d (N>=1) netCDF string type variable comes out - # as a numpy object array, so convert it to numpy - # string array. - array = array.astype("U", copy=False) - # NetCDF4 doesn't auto-mask VLEN variables - array = np.ma.where(array == "", np.ma.masked, array) + g = self.read_vars + match g["nc_opened_with"]: + case 'zarr': + if array.dtype == np.dtypes.StringDType(): + array = array.astype("O", copy=False).astype("U", copy=False) + array = np.ma.masked_values(array, "") + + case _: + if array.dtype is None: + if g["has_groups"]: + group, name = self._netCDF4_group( + g["variable_grouped_dataset"][ncvar], ncvar + ) + variable = group.variables.get(name) + else: + variable = g["variables"].get(ncvar) + + array = variable[...] + + string_type = isinstance(array, str) + if string_type: + # A netCDF string type scalar variable comes + # out as Python str object, so convert it to a + # numpy array. + array = np.array(array, dtype=f"U{len(array)}") + + if not variable.ndim: + # NetCDF4 has a thing for making scalar size 1 + # variables into 1d arrays + array = array.squeeze() + + if not string_type: + # An N-d (N>=1) netCDF string type variable + # comes out as a numpy object array, so + # convert it to numpy string array. 
+ array = array.astype("U", copy=False) + # netCDF4 doesn't auto-mask VLEN variables + # array = np.ma.where(array == "", np.ma.masked, array) + array = np.ma.masked_values(array, "") + # Set the dask chunking strategy chunks = self._dask_chunks( diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 8a3c97757..cf463abb4 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -105,6 +105,11 @@ def _createGroup(self, parent, group_name): case "zarr": return parent.create_group(group_name) + case _: + raise ValueError( + f"Bad backend: {self.write_vars['backend']!r}" + ) # pragma: no cover + def _create_variable_name(self, parent, default): """Create an appropriate name for a dataset variable. @@ -298,7 +303,6 @@ def _write_variable_attributes(self, parent, ncvar, extra=None, omit=()): del netcdf_attrs["_FillValue"] if not g["dry_run"]: - # TODOZARR self._set_attributes(netcdf_attrs, ncvar) if skip_set_fill_value: @@ -310,7 +314,28 @@ def _write_variable_attributes(self, parent, ncvar, extra=None, omit=()): return netcdf_attrs def _set_attributes(self, attributes, ncvar=None, group=None): - """TODOZARR.""" + """Set dataset attributes on a variable or group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + attributes: `dict` + The attributes. + + ncvar: `str`, optional + The variable on which to set the attributes. Must be + set if *group* is `None`. + + group: `str`, optional + The group on which to set the attributes. Must be set + if *ncvar* is `None`. + + :Returns: + + `None` + + """ g = self.write_vars if ncvar is not None: # Set variable attributes @@ -322,11 +347,12 @@ def _set_attributes(self, attributes, ncvar=None, group=None): raise ValueError("Must set ncvar or group") match g["backend"]: - case "netCDF": + case "netCDF4": x.setncatts(attributes) case "zarr": - print("ATTR=", attributes) x.update_attributes(attributes) + case _: + raise ValueError(f"Bad backend: {g['backend']!r}") def _character_array(self, array): """Converts a numpy array of strings to character data type. @@ -428,13 +454,20 @@ def _datatype(self, variable): if not isinstance(variable, np.ndarray): data = self.implementation.get_data(variable, None) if data is None: + if fmt == "ZARR3": + return str + return "S1" else: data = variable dtype = getattr(data, "dtype", None) if dtype is None or dtype.kind in "SU": - if g["fmt"] == "NETCDF4" and g["string"]: + fmt = g["fmt"] + if fmt == "NETCDF4" and g["string"]: + return str + + if fmt == "ZARR3": return str return "S1" @@ -446,8 +479,9 @@ def _datatype(self, variable): return f"{dtype.kind}{dtype.itemsize}" def _string_length_dimension(self, size): - """Creates a dataset dimension for string variables if - necessary. + """Return a dataset dimension for string variables. + + The dataset dimension will be created, if required. :Parameters: @@ -476,7 +510,6 @@ def _string_length_dimension(self, size): if not g["dry_run"]: try: - # parent_group.createDimension(ncdim, size) TODOZARR self._createDimension(parent_group, ncdim, size) except RuntimeError: pass # TODO convert to 'raise' via fixes upstream @@ -484,13 +517,36 @@ def _string_length_dimension(self, size): return ncdim def _createDimension(self, group, ncdim, size): - """TODOZARR.""" + """Create a dataset dimension in group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: `netCDF.Dataset` or `netCDF.Group` or `zarr.Group` + The group in which to create the dimension. 
+ + ncdim: `str` + The name of the dimension in the group. + + size: `int` + The size of the dimension. + + :Returns: + + `None` + + """ match self.write_vars["backend"]: case "netCDF4": group.createDimension(ncdim, size) case "zarr": # Dimensions are not created in Zarr datasets pass + case _: + raise ValueError( + f"Bad backend: {self.write_vars['backend']!r}" + ) # pragma: no cover def _dataset_dimensions(self, field, key, construct): """Returns the dataset dimension names for the construct. @@ -1223,17 +1279,13 @@ def _write_geometry_container(self, field, geometry_container): kwargs = { "varname": ncvar, "datatype": "S1", -# "dimensions": (), # TODOZARR "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) if not g["dry_run"]: self._createVariable(**kwargs) - - # TODOZARR - # g["nc"][ncvar].setncatts(geometry_container) - self._set_attributes(ncvar, geometry_container) + self._set_attributes(geometry_container, ncvar) # Update the 'geometry_containers' dictionary g["geometry_containers"][ncvar] = geometry_container @@ -2479,7 +2531,8 @@ def _set_external_variables(self, ncvar): "external_variables": " ".join( sorted(external_variables) ) - } + }, + group=g["dataset"], ) # g["dataset"].setncattr( @@ -2534,6 +2587,7 @@ def _createVariable(self, **kwargs): """ g = self.write_vars + ncvar = kwargs["varname"] match g["backend"]: @@ -2541,10 +2595,10 @@ def _createVariable(self, **kwargs): netcdf4_kwargs = kwargs if "dimensions" not in kwargs: netcdf4_kwargs["dimensions"] = () - - if kwargs["contiguous"] and g["dataset"].data_model.startswith( - "NETCDF4" - ): + + if kwargs.get("contiguous") and g[ + "dataset" + ].data_model.startswith("NETCDF4"): # NETCDF4 contiguous variables can't be compressed kwargs["compression"] = None kwargs["complevel"] = 0 @@ -2565,27 +2619,39 @@ def _createVariable(self, **kwargs): f"{unlimited_dimensions}" ) - # Remove Zarr-specific kwargs + # Remove any Zarr-specific kwargs netcdf4_kwargs.pop("shape", None) netcdf4_kwargs.pop("shards", None) variable = g["dataset"].createVariable(**netcdf4_kwargs) - case "zarr": - print("kwargs = ", kwargs) + case "zarr": + shape = kwargs.get("shape", ()) + chunks = kwargs.get("chunksizes", "auto") + if chunks is None or not shape: + chunks = shape + + dtype = kwargs["datatype"] + if dtype == "S1": + dtype = str + zarr_kwargs = { "name": ncvar, - "dtype": kwargs["datatype"], - "shape": kwargs.get("shape", ()), - "dimension_names": kwargs.get("dimensions", ()), - "chunks": kwargs.get("chunks", "auto"), + "shape": shape, + "dtype": dtype, + "chunks": chunks, "shards": kwargs.get("shards"), - "compressors": None, # TODOZARR "fill_value": kwargs.get("fill_value"), - "overwrite": True, + "dimension_names": kwargs.get("dimensions", ()), + "storage_options": g.get("storage_options"), + "overwrite": g["overwrite"], } print("zarr_kwargs = ", zarr_kwargs) variable = g["dataset"].create_array(**zarr_kwargs) + print('___________') + + case _: + raise ValueError(f"Bad backend: {g['backend']!r}") g["nc"][ncvar] = variable @@ -2629,13 +2695,12 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): kwargs = { "varname": ncvar, "datatype": "S1", -# "dimensions": (), # TODOZARR "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) if not g["dry_run"]: - self._createVariable(kwargs) + self._createVariable(**kwargs) # Add named parameters parameters = self.implementation.get_datum_parameters(ref) @@ -2663,8 +2728,6 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): parameters[term] = value if not 
g["dry_run"]: - # TODOZARR - # g["nc"][ncvar].setncatts(parameters) self._set_attributes(parameters, ncvar) # Update the 'seen' dictionary @@ -2819,7 +2882,7 @@ def _write_netcdf_variable( cfvar, "_FillValue", None ) else: - fill_value = None + fill_value = None # ppp if data_variable: lsd = g["least_significant_digit"] @@ -2926,8 +2989,8 @@ def _write_netcdf_variable( if quantize_on_write: if g["backend"] == "zarr": raise NotImplementedError( - f"Can't yet quantize on write {cfvar!r} to a Zarr " - "dataset TODOZARR" + f"Can't yet quantize-on-write {cfvar!r} to a Zarr " + "dataset" ) # Set "implemention" to this version of the netCDF-C @@ -3021,7 +3084,7 @@ def _write_netcdf_variable( if g["cfa"].get("strict", True): # Raise the exception in 'strict' mode if g["mode"] == "w": - os.remove(g["filename"]) + self.dataset_remove() raise @@ -3035,12 +3098,11 @@ def _write_netcdf_variable( # dimensions and dataset chunking strategy will # otherwise reflect the aggregated data in memory, # rather than the scalar variable in the file. - kwargs["shape"] = () # zarr - kwargs["chunks"] = "auto" # zarr - kwargs["shards"] = None # zarr - kwargs["dimensions"] = () # netCDF4 + zarr - kwargs["contiguous"] = True # netCDF4 - kwargs["chunksizes"] = None # netCDF4 + kwargs["contiguous"] = True + kwargs["chunksizes"] = None + kwargs["dimensions"] = () + kwargs["shape"] = () + kwargs["shards"] = None # Add compression parameters (but not for scalars or vlen # strings). @@ -3064,29 +3126,6 @@ def _write_netcdf_variable( f" to variable: {ncvar}({', '.join(ncdimensions)})" ) # pragma: no cover - # Adjust createVariable arguments for contiguous variables - # TODOZARR - moved to `_createVariable` - # if kwargs["contiguous"]: - # if g["dataset"].data_model.startswith("NETCDF4"): - # # NETCDF4 contiguous variables can't span unlimited - # # dimensions - # unlimited_dimensions = g["unlimited_dimensions"].intersection( - # kwargs["dimensions"] - # ) - # if unlimited_dimensions: - # data_model = g["dataset"].data_model - # raise ValueError( - # f"Can't create variable {ncvar!r} in {data_model} " - # f"file from {cfvar!r}: In {data_model} it is not " - # "allowed to write contiguous (as opposed to chunked) " - # "data that spans one or more unlimited dimensions: " - # f"{unlimited_dimensions}" - # ) - # - # # NETCDF4 contiguous variables can't be compressed - # kwargs["compression"] = None - # kwargs["complevel"] = 0 - try: self._createVariable(**kwargs) except RuntimeError as error: @@ -3214,8 +3253,6 @@ def _transform_strings(self, data, ncdimensions): `Data`, `tuple` """ - # TODOZARR - consider always writing string arrays in zarr (rather than char arrays) - datatype = self._datatype(data) if data is not None and datatype == "S1": @@ -3291,6 +3328,7 @@ def _write_data( `None` """ + print ('ncvar=', ncvar, repr(data)) g = self.write_vars if cfa: @@ -3304,10 +3342,12 @@ def _write_data( # Still here? 
The write a normal (non-aggregation) variable # ------------------------------------------------------------ if compressed: - # Write data in its compressed form + # Write data in its compressed form data = data.source().source() + print ('compressed' ,repr(data)) # Get the dask array + print('data.fill_value', data._FillValue) dx = da.asanyarray(data) # Convert the data type @@ -3315,6 +3355,7 @@ def _write_data( if new_dtype is not None: dx = dx.astype(new_dtype) + # VLEN variables can not be assigned to by masked arrays # (https://github.com/Unidata/netcdf4-python/pull/465), so # fill missing data in string (as opposed to char) data types. @@ -3338,9 +3379,8 @@ def _write_data( attributes=attributes, meta=np.array((), dx.dtype), ) - - # TODOZARR - print(type(g["nc"][ncvar])) + print('dx', repr(dx), dx.compute()) + print('ertertertr', repr(g["nc"][ncvar])) da.store(dx, g["nc"][ncvar], compute=True, return_stored=False) def _check_valid(self, array, cfvar=None, attributes=None): @@ -3401,7 +3441,7 @@ def _check_valid(self, array, cfvar=None, attributes=None): print( message.format( cfvar, - self.write_vars["filename"], + self.write_vars["dataset_name"], "less", "minimum", prop, @@ -3424,7 +3464,7 @@ def _check_valid(self, array, cfvar=None, attributes=None): print( message.format( cfvar, - self.write_vars["filename"], + self.write_vars["dataset_name"], "greater", "maximum", prop, @@ -3484,6 +3524,7 @@ def _write_field_or_domain( """ g = self.write_vars + ncdim_size_to_spanning_constructs = [] seen = g["seen"] @@ -4181,10 +4222,6 @@ def _write_field_or_domain( formula_terms = " ".join(formula_terms) if not g["dry_run"] and not g["post_dry_run"]: try: - # TODOZARR - # g["nc"][ncvar].setncattr( - # "formula_terms", formula_terms - # ) self._set_attributes( {"formula_terms": formula_terms}, ncvar ) @@ -4203,10 +4240,6 @@ def _write_field_or_domain( bounds_formula_terms = " ".join(bounds_formula_terms) if not g["dry_run"] and not g["post_dry_run"]: try: - # TODOZARR - # g["nc"][bounds_ncvar].setncattr( - # "formula_terms", bounds_formula_terms - # ) self._set_attributes( {"formula_terms": bounds_formula_terms}, bounds_ncvar, @@ -4486,29 +4519,6 @@ def _unlimited(self, field, axis): """ return self.implementation.nc_is_unlimited_axis(field, axis) - # def _write_group(self, parent_group, group_name): - # """Creates a new parent group object. - # - # .. versionadded:: (cfdm) 1.8.6.0 - # - # :Parameters: - # - # parent_group: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` - # - # group_name: `str` - # - # :Returns: - # - # `netCDF4.Group` or `zarr.Group` - # - # """ - # backend = self.write_vars['backend'] - # if backend == 'netCDF4': - # return parent_group.createGroup(group_name) - # - # if backend == 'zarr': - # return parent_group.create_group(group_name) - def _write_group_attributes(self, fields): """Writes the group-level attributes to the file. @@ -4761,7 +4771,51 @@ def _write_global_attributes(self, fields): g["global_attributes"] = global_attributes - def dataset_close(self, filename): + def dataset_exists(self, dataset): + """Whether or not a dataset exists on disk. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `str` + The name of the dataset. + + :Returns: + + `bool` + Whether or not the dataset exists on disk. + + """ + match self.write_vars["dataset_type"]: + case "file": + return os.path.isfile(dataset) + + case "directory": + return os.path.isdir(dataset) + + def dataset_remove(self): + """Remove the dataset that is being created. + + .. 
note:: If the dataset is a directory, then it is silently + not removed. To do so could be very dangerous (what + if it were your home space?). + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `None` + + """ + g = self.write_vars + match g["dataset_type"]: + case "file": + os.remove(g["dataset_name"]) + case "directory": + pass + + def dataset_close(self): """Close the dataset that has been written. .. versionadded:: (cfdm) 1.7.0 @@ -4775,31 +4829,30 @@ def dataset_close(self, filename): if g["backend"] == "netCDF4": g["dataset"].close() - def dataset_open(self, filename, mode, fmt, fields): + def dataset_open(self, dataset_name, mode, fmt, fields): """Open the dataset for writing. .. versionadded:: (cfdm) 1.7.0 :Parameters: - filename: `str` - As for the *filename* parameter for initialising a - `netCDF.Dataset` instance. + dataset_name: `str` + The dataset to open. mode: `str` As for the *mode* parameter for initialising a - `netCDF.Dataset` instance. + `netCDF4.Dataset` instance. fmt: `str` As for the *format* parameter for initialising a - `netCDF.Dataset` instance. + `netCDF4.Dataset` instance. Ignored for Zarr datasets. fields: sequence of `Field` or `Domain` The constructs to be written to the netCDF file. Note that these constructs are only used to ascertain if - any data to be written is in *filename*. If this is - the case and mode is "w" then an exception is raised - to prevent *filename* from being deleted. + any data to be written is in *dataset_name*. If this + is the case and mode is "w" then an exception is + raised to prevent *dataset_name* from being deleted. :Returns: @@ -4807,33 +4860,39 @@ def dataset_open(self, filename, mode, fmt, fields): """ if fields and mode == "w": - filename = os.path.abspath(filename) + dataset_name = os.path.abspath(dataset_name) for f in fields: - if filename in self.implementation.get_original_filenames(f): + if dataset_name in self.implementation.get_original_filenames( + f + ): raise ValueError( - "Can't write with mode 'w' to a file that contains " - f"data that needs to be read: {f!r} uses {filename}" + "Can't write with mode 'w' to a dataset that contains " + f"data which needs to be read: {f!r} uses " + f"{dataset_name}" ) g = self.write_vars + + # mode == 'w' is safer than != 'a' in case of a typo (the + # letters are neighbours on a QWERTY keyboard) since 'w' is + # destructive. Note that for append ('a') mode the original + # file is never wiped. + if mode == "w" and g["overwrite"]: + self.dataset_remove() + match g["backend"]: case "netCDF4": - # mode == 'w' is safer than != 'a' in case of a typo - # (the letters are neighbours on a QWERTY keyboard) - # since 'w' is destructive. Note that for append - # ('a') mode the original file is never wiped. 
- if mode == "w" and g["overwrite"]: - os.remove(filename) - try: - nc = netCDF4.Dataset(filename, mode, format=fmt) + nc = netCDF4.Dataset(dataset_name, mode, format=fmt) except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + raise RuntimeError(f"{error}: {dataset_name}") case "zarr": nc = zarr.group( - filename, overwrite=g["overwrite"], zarr_format=3 + dataset_name, overwrite=g["overwrite"], zarr_format=3 ) + case _: + raise ValueError(f"Bad backend: {g['backend']!r}") return nc @@ -4841,7 +4900,7 @@ def dataset_open(self, filename, mode, fmt, fields): def write( self, fields, - filename, + dataset_name, fmt="NETCDF4", mode="w", overwrite=True, @@ -4865,7 +4924,8 @@ def write( group=True, coordinates=False, omit_data=None, - dataset_chunks="4MiB", dataset_shards=None, + dataset_chunks="4MiB", + dataset_shards=None, cfa="auto", reference_datetime=None, ): @@ -4894,8 +4954,8 @@ def write( See `cfdm.write` for details. - filename: str - The output CF-netCDF file. + dataset_name: str + The output CF-netCDF file. TODOZARR See `cfdm.write` for details. @@ -5003,7 +5063,8 @@ def write( See `cfdm.write` for details. endian: `str`, optional - The endian-ness of the output file. + The endian-ness of the output file. Ignored for Zarr + datasets. See `cfdm.write` for details. @@ -5013,14 +5074,15 @@ def write( See `cfdm.write` for details. least_significant_digit: `int`, optional - Truncate the input field construct data arrays, but not - the data arrays of metadata constructs. + Truncate the input field construct data arrays, but + not the data arrays of metadata constructs. Ignored + for Zarr datasets. See `cfdm.write` for details. chunk_cache: `int` or `None`, optional - The amount of memory (in bytes) used in each - variable's chunk cache at the HDF5 level. + The amount of memory (in bytes) used in each HDF5 + variable's chunk cache. Ignored for Zarr datasets. See `cfdm.write` for details. @@ -5029,16 +5091,17 @@ def write( fletcher32: `bool`, optional If True then the Fletcher-32 HDF5 checksum algorithm is activated to detect compression errors. Ignored if - *compress* is ``0``. + *compress* is ``0``. Ignored for Zarr datasets. See `cfdm.write` for details. shuffle: `bool`, optional - If True (the default) then the HDF5 shuffle filter (which - de-interlaces a block of data before compression by - reordering the bytes by storing the first byte of all of a - variable's values in the chunk contiguously, followed by - all the second bytes, and so on) is turned off. + If True (the default) then the HDF5 shuffle filter + (which de-interlaces a block of data before + compression by reordering the bytes by storing the + first byte of all of a variable's values in the chunk + contiguously, followed by all the second bytes, and so + on) is turned off. Ignored for Zarr datasets. See `cfdm.write` for details. 
@@ -5130,8 +5193,8 @@ def write(
         logger.info(f"Writing to {fmt}")  # pragma: no cover
 
         # Expand file name
-        filename = os.path.expanduser(os.path.expandvars(filename))
-        filename = abspath(filename)
+        dataset_name = os.path.expanduser(os.path.expandvars(dataset_name))
+        dataset_name = abspath(dataset_name)
 
         # Parse the 'omit_data' parameter
         if omit_data is None:
@@ -5153,7 +5216,7 @@ def write(
         # Initialise netCDF write parameters
         # ------------------------------------------------------------
         self.write_vars = {
-            "filename": filename,
+            "dataset_name": dataset_name,
             # Format of output file
             "fmt": None,
             # netCDF4.Dataset instance
@@ -5276,6 +5339,16 @@ def write(
                 f"{dataset_chunks!r}."
             )
 
+        # Parse the 'dataset_shards' parameter
+        if dataset_shards is not None:
+            try:
+                self.write_vars["dataset_shards"] = parse_bytes(dataset_shards)
+            except (ValueError, AttributeError):
+                raise ValueError(
+                    "Invalid value for the 'dataset_shards' keyword: "
+                    f"{dataset_shards!r}."
+                )
+
         # ------------------------------------------------------------
         # Parse the 'cfa' keyword
         # ------------------------------------------------------------
@@ -5327,12 +5400,13 @@ def write(
         if mode == "a":
             # First read in the fields from the existing file:
             effective_fields = self._NetCDFRead(self.implementation).read(
-                filename, netcdf_backend="netCDF4"
+                dataset_name, netcdf_backend="netCDF4"
             )
 
             # Read rather than append for the first iteration to ensure nothing
             # gets written; only want to update the 'seen' dictionary first.
             effective_mode = "r"
+
             overwrite = False
             self.write_vars["dry_run"] = True
 
@@ -5392,7 +5466,7 @@ def write(
                 mode=effective_mode,
                 overwrite=overwrite,
                 fields=effective_fields,
-                filename=filename,
+                dataset_name=dataset_name,
                 fmt=fmt,
                 global_attributes=global_attributes,
                 variable_attributes=variable_attributes,
@@ -5425,7 +5499,7 @@ def write(
             mode=mode,
             overwrite=overwrite,
             fields=fields,
-            filename=filename,
+            dataset_name=dataset_name,
             fmt=fmt,
             global_attributes=global_attributes,
             variable_attributes=variable_attributes,
@@ -5451,7 +5525,7 @@ def _file_io_iteration(
         mode,
         overwrite,
         fields,
-        filename,
+        dataset_name,
         fmt,
         global_attributes,
         variable_attributes,
@@ -5582,8 +5656,10 @@ def _file_io_iteration(
         g["fmt"] = fmt
         if fmt == "ZARR3":
             g["backend"] = "zarr"
+            g["dataset_type"] = "directory"
         else:
             g["backend"] = "netCDF4"
+            g["dataset_type"] = "file"
 
         if isinstance(
             fields,
@@ -5612,23 +5688,23 @@ def _file_io_iteration(
         # ------------------------------------------------------------
         # Open the output dataset
         # ------------------------------------------------------------
-        if os.path.isfile(filename):
+        if self.dataset_exists(dataset_name):
             if mode == "w" and not overwrite:
                 raise IOError(
-                    "Can't write with mode {mode!r} to existing file "
-                    f"{os.path.abspath(filename)} unless overwrite=True"
+                    f"Can't write with mode {mode!r} to existing dataset "
+                    f"{os.path.abspath(dataset_name)} unless overwrite=True"
                 )
 
-            if not os.access(filename, os.W_OK):
+            if not os.access(dataset_name, os.W_OK):
                 raise IOError(
-                    "Can't write to existing file "
-                    f"{os.path.abspath(filename)} without permission"
+                    "Can't write to existing dataset "
+                    f"{os.path.abspath(dataset_name)} without permission"
                 )
         else:
             g["overwrite"] = False
 
-        g["filename"] = filename
-        g["dataset"] = self.dataset_open(filename, mode, fmt, fields)
+        g["dataset_name"] = dataset_name
+        g["dataset"] = self.dataset_open(dataset_name, mode, fmt, fields)
 
         if not g["dry_run"]:
             # --------------------------------------------------------
@@ -5658,10 +5734,10 
@@ def _file_io_iteration( ) external = os.path.expanduser(os.path.expandvars(external)) - if os.path.realpath(external) == os.path.realpath(filename): + if os.path.realpath(external) == os.path.realpath(dataset_name): raise ValueError( - "Can't set filename and external to the " "same path" - ) + "Can't set dataset_name and external to the same path" + ) # TODOZARR g["external_file"] = external @@ -5677,7 +5753,7 @@ def _file_io_iteration( # For append mode, it is cleaner code-wise to close the file # on the read iteration and re-open it for the append # iteration. So we always close it here. - self.dataset_close(filename) + self.dataset_close() # ------------------------------------------------------------ # Write external fields to the external file @@ -5685,7 +5761,7 @@ def _file_io_iteration( if g["external_fields"] and g["external_file"] is not None: self.write( fields=g["external_fields"], - filename=g["external_file"], + dataset_name=g["external_file"], fmt=fmt, overwrite=overwrite, datatype=datatype, @@ -5810,10 +5886,6 @@ def _chunking_parameters(self, data, ncdimensions): chunksizes = self.implementation.nc_get_dataset_chunksizes(data) if chunksizes == "contiguous": # Contiguous as defined by 'data' - if g["zarr"]: - # Return a single chunk - return False, self._shape_in_dataset(data, ncdimensions) - return True, None # Still here? @@ -5829,10 +5901,6 @@ def _chunking_parameters(self, data, ncdimensions): # dataset_chunks if dataset_chunks == "contiguous": # Contiguous as defined by 'dataset_chunks' - if g["zarr"]: - # Return a single chunk - return False, self._shape_in_dataset(data, ncdimensions) - return True, None # Still here? Then work out the chunks from both the @@ -5862,14 +5930,20 @@ def _chunking_parameters(self, data, ncdimensions): # data contiguously. return True, None - def _shape_in_dataset(self, data, ncdimensions): - """TODOZARR.""" - if self._compressed_data(ncdimensions): - d = self.implementation.get_compressed_array(data) - else: - d = data - - return d.shape + # def _shape_in_dataset(self, data, ncdimensions): + # """TODOZARR.""" + # if data is not None: + # # Get the shape from the data array + # if self._compressed_data(ncdimensions): + # d = self.implementation.get_compressed_array(data) + # else: + # d = data + # + # return d.shape + # + # # Still here? Then there's no data, so get the shape from the + # # netCDF dimensions + # return tuple([g['ncdim_to_size'][ncdim] for ncdim in ncdimensions]) def _compressed_data(self, ncdimensions): """Whether or not the data is being written in compressed form. @@ -6351,9 +6425,10 @@ def _cfa_fragment_array_variables(self, data, cfvar): if not data.nc_get_aggregation_write_status(): raise AggregationError( f"Can't write {cfvar!r} as a CF aggregation variable. " - "This is probably because some fragment values have been " - "changed relative to those in the fragment files, or a " - "rechunking has occured." + "This is could be " + "because some fragment values in memory have been " + "changed relative to those in the fragment files, " + "or a Dask rechunking has occured, etc." 
) # ------------------------------------------------------------ @@ -6389,7 +6464,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): # URI aggregation_file_directory = g["aggregation_file_directory"] if aggregation_file_directory is None: - uri = urisplit(dirname(g["filename"])) + uri = urisplit(dirname(g["dataset_name"])) if uri.isuri(): aggregation_file_scheme = uri.scheme aggregation_file_directory = uri.geturi() @@ -6417,7 +6492,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): # fragment file fragment = data[index].compute(_force_to_memory=False) try: - filename, address, is_subspace, f_index = ( + dataset_name, address, is_subspace, f_index = ( fragment.get_filename(normalise=normalise), fragment.get_address(), fragment.is_subspace(), @@ -6432,9 +6507,9 @@ def _cfa_fragment_array_variables(self, data, cfvar): f"The Dask chunk in position {position} " f"(defined by data index {index!r}) does not " "reference a unique fragment file. This is could be " - "because some fragment values have been changed " - "relative to those in the fragment files, or a " - "Dask rechunking has occured, etc." + "because some fragment values in memory have been " + "changed relative to those in the fragment files, " + "or a Dask rechunking has occured, etc." ) if is_subspace: @@ -6447,16 +6522,16 @@ def _cfa_fragment_array_variables(self, data, cfvar): f"(defined by data index {index!r}) references " f"a subspace ({f_index!r}) of the fragment file " f"{fragment!r}. This might be fixable by setting " - "the 'cfa_write' parameter to the 'read' function." + "the 'cfa_write' keyword in the 'read' function." ) - uri = urisplit(filename) + uri = urisplit(dataset_name) if uri_relative and uri.isrelpath(): - filename = abspath(filename) + dataset_name = abspath(dataset_name) if uri.isabspath(): # File name is an absolute-path URI reference - filename = uricompose( + dataset_name = uricompose( scheme="file", authority="", path=abspath(uri.path), @@ -6479,11 +6554,11 @@ def _cfa_fragment_array_variables(self, data, cfvar): f"({aggregation_file_scheme}:) is incompatible." ) - filename = relpath( - filename, start=aggregation_file_directory + dataset_name = relpath( + dataset_name, start=aggregation_file_directory ) - aggregation_uris.append(filename) + aggregation_uris.append(dataset_name) aggregation_identifiers.append(address) # Reshape the 1-d aggregation instruction arrays to span @@ -6578,7 +6653,6 @@ def _write_quantization_container(self, quantization): kwargs = { "varname": ncvar, "datatype": "S1", -# "dimensions": (), # TODOZARR "endian": g["endian"], } kwargs.update(g["netcdf_compression"]) @@ -6586,12 +6660,6 @@ def _write_quantization_container(self, quantization): if not g["dry_run"]: # Create the variable self._createVariable(**kwargs) - - # Set the attributes - # g["nc"][ncvar].setncatts( - # self.implementation.parameters(quantization) - # ) - # TODOZARR self._set_attributes( self.implementation.parameters(quantization), ncvar ) diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 8595fdda0..a9a5e6eaf 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -81,14 +81,12 @@ class write(ReadWrite): :Parameters: fields: (sequence of) `Field` or `Domain` - The field and domain constructs to write to the file. + The field and domain constructs to write to the dataset. - filename: `str` - The output netCDF file name. Various type of expansion are - applied to the file names. 
- - Relative paths are allowed, and standard tilde and shell - parameter expansions are applied to the string. + dataset_name: `str` + The output dataset name. Relative paths are allowed, and + standard tilde and shell parameter expansions are applied + to the string. *Parameter example:* The file ``file.nc`` in the user's home directory could @@ -100,7 +98,7 @@ class write(ReadWrite): The format of the output file. One of: ========================== ============================== - *fmt* Output file type + *fmt* Output dataset type ========================== ============================== ``'NETCDF4'`` NetCDF4 format file. This is the default. @@ -121,12 +119,14 @@ class write(ReadWrite): ``'NETCDF3_64BIT_DATA'`` NetCDF3 64-bit offset format file with extensions (see below) + + ``'ZARR3'`` Zarr v3 ========================== ============================== By default the format is ``'NETCDF4'``. - All formats support large files (i.e. those greater than - 2GB) except ``'NETCDF3_CLASSIC'``. + All NETCDF formats support large files (i.e. those greater + than 2GB) except ``'NETCDF3_CLASSIC'``. ``'NETCDF3_64BIT_DATA'`` is a format that requires version 4.4.0 or newer of the C library (use @@ -146,7 +146,8 @@ class write(ReadWrite): and use the new features of the version 4 API. mode: `str`, optional - Specify the mode of write access for the output file. One of: + Specify the mode of write access for the output + datset. One of: ======== ================================================= *mode* Description @@ -197,12 +198,12 @@ class write(ReadWrite): ======== ================================================= - By default the file is opened with write access mode + By default the dataset is opened with write access mode ``'w'``. overwrite: `bool`, optional - If False then raise an error if the output file - pre-exists. By default a pre-existing output file is + If False then raise an error if the output dataset + pre-exists. By default a pre-existing output dataset is overwritten. Conventions: (sequence of) `str`, optional @@ -261,8 +262,8 @@ class write(ReadWrite): construct properties, which are created as netCDF global attributes by default: - * the description of file contents properties (as defined - by the CF conventions), and + * the description of dataset contents properties (as + defined by the CF conventions), and * properties flagged as global on any of the field constructs being written (see @@ -278,7 +279,7 @@ class write(ReadWrite): data variable corresponding to each field construct that contains the property. - Any global attributes that are also specified as file + Any global attributes that are also specified as dataset descriptors will not be written as netCDF global variables, but as netCDF data variable attributes instead. @@ -316,8 +317,8 @@ class write(ReadWrite): external: `str`, optional Write metadata constructs that have data and are marked as - external to the named external file. Ignored if there are - no such constructs. + external to the named external dataset. Ignored if there + are no such constructs. datatype: `dict`, optional Specify data type conversions to be applied prior to @@ -343,7 +344,7 @@ class write(ReadWrite): ``'little'``, ``'big'`` or ``'native'``. By default the output is native endian. See the `netCDF4 package `_ for more - details. + details. Ignored for Zarr datsets. 
*Parameter example:* ``endian='big'`` @@ -374,8 +375,8 @@ class write(ReadWrite): ``least_significant_digit=3`` chunk_cache: `int` or `None`, optional - The amount of memory (in bytes) used in each variable's - chunk cache at the HDF5 level. + The amount of memory (in bytes) used in each HDF5 + variable's chunk cache. Ignored when not writing to a netCDF-4 format. By default, or if `None`, the default netCDF-C chunk cache size of @@ -420,14 +421,14 @@ class write(ReadWrite): string: `bool`, optional By default string-valued construct data are written as - netCDF arrays of type string if the output file format is - ``'NETCDF4'`` or ``'ZARR3'``, or of type char with an + netCDF arrays of type string if the output dataset format + is ``'NETCDF4'`` or ``'ZARR3'``, or of type char with an extra dimension denoting the maximum string length for any - other output file format (see the *fmt* parameter). If + other output dataset format (see the *fmt* parameter). If *string* is False then string-valued construct data are written as netCDF arrays of type char with an extra dimension denoting the maximum string length, regardless - of the selected output file format. + of the selected output dataset format. .. versionadded:: (cfdm) 1.8.0 @@ -469,7 +470,7 @@ class write(ReadWrite): The consequence of writing out-of-range data values is that, by default, these values will be masked when the - file is subsequently read. + dataset is subsequently read. *Parameter example:* If a construct has ``valid_max`` property with value @@ -480,11 +481,11 @@ class write(ReadWrite): .. versionadded:: (cfdm) 1.8.3 group: `bool`, optional - If False then create a "flat" netCDF file, i.e. one with - only the root group, regardless of any group structure + If False then create a "flat" dataset, i.e. one with only + the root group, regardless of any group structure specified by the field constructs. By default any groups - defined by the netCDF interface of the field constructs and - its components will be created and populated. + defined by the netCDF interface of the field constructs + and its components will be created and populated. .. versionadded:: (cfdm) 1.8.6 @@ -500,11 +501,11 @@ class write(ReadWrite): Do not write the data of the named construct types. This does not affect the amount of netCDF variables and - dimensions that are written to the file, nor the netCDF + dimensions that are written to the dataset, nor the netCDF variables' attributes, but does not create data on disk - for the requested variables. The resulting file will be + for the requested variables. The resulting dataset will be smaller than it otherwise would have been, and when the - new file is read the data of these variables will be + new dataset is read the data of these variables will be represented by an array of all missing data. The *omit_data* parameter may be one, or a sequence, of: @@ -535,8 +536,8 @@ class write(ReadWrite): .. versionadded:: (cfdm) 1.10.0.1 dataset_chunks: `str` or `int` or `float`, optional - The dataset chunking strategy for data arrays being written - to the file. + The dataset chunking strategy for data arrays being + written to the dataset. By default, *dataset_chunks* is ``'4 MiB'``, i.e. 4194304 bytes. @@ -613,7 +614,7 @@ class write(ReadWrite): dataset_shards: `str` or `int` or `float`, optional TODOZARR - + cfa: `str` or `dict` or `None`, optional Specify which netCDF variables, if any, should be written as CF-netCDF aggregation variables. 
@@ -743,6 +744,9 @@ class write(ReadWrite): Define the CF data model implementation that defines field and metadata constructs and their components. + filename: Deprecated at version NEXTVERSION + Use *dataset_name* instead. + :Returns: `None` @@ -764,7 +768,7 @@ class write(ReadWrite): def __new__( cls, fields, - filename, + dataset_name, fmt="NETCDF4", mode="w", overwrite=True, @@ -790,10 +794,11 @@ def __new__( coordinates=False, omit_data=None, dataset_chunks="4 MiB", + dataset_shards=None, cfa="auto", extra_write_vars=None, ): - """Write field and domain constructs to a netCDF file.""" + """Write field and domain constructs to a dataset.""" # Flatten the sequence of intput fields fields = tuple(cls._flat(fields)) if not fields: @@ -828,7 +833,7 @@ def __new__( netcdf = NetCDFWrite(cls.implementation) netcdf.write( fields, - filename, + dataset_name, fmt=fmt, mode=mode, overwrite=overwrite, @@ -852,6 +857,7 @@ def __new__( coordinates=coordinates, extra_write_vars=extra_write_vars, omit_data=omit_data, - dataset_chunks=dataset_chunks, dataset_shards =dataset_shards, + dataset_chunks=dataset_chunks, + dataset_shards=dataset_shards, cfa=cfa, ) From 31d1c83d522230eda39d9ac674797a66353582c8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 26 Aug 2025 23:25:32 +0100 Subject: [PATCH 06/39] dev --- cfdm/test/test_zarr.py | 140 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 cfdm/test/test_zarr.py diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py new file mode 100644 index 000000000..29d5dc26c --- /dev/null +++ b/cfdm/test/test_zarr.py @@ -0,0 +1,140 @@ +import atexit +import datetime +import faulthandler +import os +import platform +import shutil +import subprocess +import tempfile +import unittest + +import netCDF4 +import numpy as np + +faulthandler.enable() # to debug seg faults and timeouts + +import cfdm +from cfdm.read_write.exceptions import DatasetTypeError, ReadError + +warnings = False + +# Set up temporary directories +n_tmp = 9 +tmpdirs = [ + tempfile.mkdtemp("_test_zarr.zarr", dir=os.getcwd()) + for i in range(n_tmp) +] +[ + tmp1, + tmp2, + tmp3, + tmp4, + tmp5, + tmp6, + tmp7, + tmp8, + tmp9, +] = tmpdirs + +# Set up temporary files +n_tmpfiles = 1 +tmpfiles = [ + tempfile.mkstemp("_test_zarr.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +[tmpfile] = tmpfiles + + +def _remove_tmpdirs(): + """Remove temporary files created during tests.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + for d in tmpdirs: + try: + shutil.rmtree(d) + os.rmdir(d) + except OSError: + pass + + +atexit.register(_remove_tmpdirs) + +filename = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "test_file.nc" +) + + +class read_writeTest(unittest.TestCase): + """Test the reading and writing of field constructs from/to disk.""" + + filename = filename + + zarr2 = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "example_field_0.zarr2" + ) + + zarr3 = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "example_field_0.zarr3" + ) + + f0 = cfdm.example_field(0) + f1 = cfdm.example_field(1) + + string_filename = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "string_char.nc" + ) + + netcdf3_fmts = [ + "NETCDF3_CLASSIC", + "NETCDF3_64BIT", + "NETCDF3_64BIT_OFFSET", + "NETCDF3_64BIT_DATA", + ] + netcdf4_fmts = ["NETCDF4", "NETCDF4_CLASSIC"] + netcdf_fmts = netcdf3_fmts + netcdf4_fmts + + def setUp(self): + """Preparations called immediately before each test method.""" + # 
Disable log messages to silence expected warnings + cfdm.LOG_LEVEL("DISABLE") + # Note: to enable all messages for given methods, lines or + # calls (those without a 'verbose' option to do the same) + # e.g. to debug them, wrap them (for methods, start-to-end + # internally) as follows: cfdm.LOG_LEVEL('DEBUG') + # + # < ... test code ... > + # cfdm.log_level('DISABLE') + + def test_read_write_zarr_1(self): + """Test the writing of a named netCDF file.""" + i = 0 + for f in cfdm.example_fields(0, 1, 2, 3): + print ('\n\n==================================', i) + print(f) + tmp1 = 'tmp.zarr' + cfdm.write(f, tmp1, fmt='ZARR3') + g = cfdm.read(tmp1, verbose=1) + self.assertEqual(len(g) , 1) + g = g[0] + print(g) + + self.assertTrue(g.equals(f, verbose=1)) + + print ('\n\n eq1 done\n\n') + # Check that the Zarr and netCDF4 encoding contain the + # same information + tmpfile = 'delme.nc' + cfdm.write(f, tmpfile, fmt='NETCDF4') + n = cfdm.read(tmpfile)[0] + self.assertTrue(g.equals(n)) + + i += 1 +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cfdm.environment() + print("") + unittest.main(verbosity=2) From 76736e8c1aab727eb522394edfc0273e2f97f8f8 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 27 Aug 2025 13:45:34 +0100 Subject: [PATCH 07/39] dev --- Changelog.rst | 9 ++ cfdm/__init__.py | 2 +- cfdm/data/aggregatedarray.py | 9 +- cfdm/data/fragment/__init__.py | 1 + cfdm/data/fragment/fragmentfilearray.py | 7 +- cfdm/data/fragment/fragmentzarrarray.py | 10 ++ cfdm/data/netcdfindexer.py | 6 +- cfdm/read_write/netcdf/netcdfread.py | 90 +++++++++------- cfdm/read_write/netcdf/netcdfwrite.py | 131 ++++++++++++++++++++---- cfdm/test/test_Data.py | 4 +- cfdm/test/test_zarr.py | 113 +++++++++++--------- docs/source/installation.rst | 2 +- requirements.txt | 2 +- 13 files changed, 273 insertions(+), 113 deletions(-) create mode 100644 cfdm/data/fragment/fragmentzarrarray.py diff --git a/Changelog.rst b/Changelog.rst index 407a63fe4..2882a952e 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,3 +1,12 @@ +Version NEXTVERSION +---------------- + +**2025-??-??** + +* Changed dependency: ``zarr>=3.1.2`` + +---- + Version 1.12.3.0 ---------------- diff --git a/cfdm/__init__.py b/cfdm/__init__.py index 54e43089e..ac515cefc 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -120,7 +120,7 @@ except ImportError as error1: raise ImportError(_error0 + str(error1)) else: - _minimum_vn = "3.0.8" + _minimum_vn = "3.1.2" if Version(zarr.__version__) < Version(_minimum_vn): raise ValueError( f"Bad zarr version: cfdm requires zarr>={_minimum_vn}. " diff --git a/cfdm/data/aggregatedarray.py b/cfdm/data/aggregatedarray.py index 77f75ea2c..6a5d62516 100644 --- a/cfdm/data/aggregatedarray.py +++ b/cfdm/data/aggregatedarray.py @@ -263,9 +263,16 @@ def _parse_fragment_array(self, aggregated_filename, fragment_array): if not scalar: identifier = fa_identifiers[index].item() + uri = fa_uris[index] + try: + uri = uri.item() + except AttributeError: + # E.g. 
if 'uri' is a `str` instance + pass + parsed_fragment_array[index] = { "map": shape, - "uri": fa_uris[index].item(), + "uri": uri, "identifier": identifier, } else: diff --git a/cfdm/data/fragment/__init__.py b/cfdm/data/fragment/__init__.py index 8c7c76bd0..e4edf076c 100644 --- a/cfdm/data/fragment/__init__.py +++ b/cfdm/data/fragment/__init__.py @@ -2,3 +2,4 @@ from .fragmenth5netcdfarray import FragmentH5netcdfArray from .fragmentnetcdf4array import FragmentNetCDF4Array from .fragmentuniquevaluearray import FragmentUniqueValueArray +from .fragmentzarrarray import FragmentZarrArray diff --git a/cfdm/data/fragment/fragmentfilearray.py b/cfdm/data/fragment/fragmentfilearray.py index f8bc37cac..6338dcabd 100644 --- a/cfdm/data/fragment/fragmentfilearray.py +++ b/cfdm/data/fragment/fragmentfilearray.py @@ -27,12 +27,17 @@ def __new__(cls, *args, **kwargs): """ # Import fragment classes. Do this here (as opposed to outside # the class) to aid subclassing. - from . import FragmentH5netcdfArray, FragmentNetCDF4Array + from . import ( + FragmentH5netcdfArray, + FragmentNetCDF4Array, + FragmentZarrArray, + ) instance = super().__new__(cls) instance._FragmentArrays = ( FragmentNetCDF4Array, FragmentH5netcdfArray, + FragmentZarrArray, ) return instance diff --git a/cfdm/data/fragment/fragmentzarrarray.py b/cfdm/data/fragment/fragmentzarrarray.py new file mode 100644 index 000000000..3ea293df2 --- /dev/null +++ b/cfdm/data/fragment/fragmentzarrarray.py @@ -0,0 +1,10 @@ +from ..zarrarray import ZarrArray +from .mixin import FragmentFileArrayMixin + + +class FragmentZarrArray(FragmentFileArrayMixin, ZarrArray): + """A fragment of aggregated data in a file accessed with `zarr`. + + .. versionadded:: (cfdm) NEXTVERSION + + """ diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index e0d07ec52..846e2bfdd 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -392,9 +392,13 @@ def _default_FillValue(self, dtype): The default ``_FillValue``. """ - if dtype.kind in "OST": + kind = dtype.kind + if kind in "OS": return default_fillvals["S1"] + if kind == "T": + return "" + return default_fillvals[dtype.str[1:]] def _index(self, index, data=None): diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 9b976c2b3..db7238087 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -707,7 +707,7 @@ def _open_zarr(self, dataset): error.msg += ". Install the 'zarr' package to read Zarr datasets" raise - nc = zarr.open(dataset) + nc = zarr.open(dataset, mode="r") self.read_vars["dataset_opened_with"] = "zarr" return nc @@ -7091,7 +7091,7 @@ def _create_data( # # b) Cached values are never really required for # compression index data. - self._cache_data_elements(data, ncvar) + self._cache_data_elements(data, ncvar, attributes) # ------------------------------------------------------------ # Set data aggregation parameters @@ -8179,12 +8179,14 @@ def _create_Data( """ g = self.read_vars - match g["nc_opened_with"]: - case 'zarr': + match g["nc_opened_with"]: + case "zarr": if array.dtype == np.dtypes.StringDType(): - array = array.astype("O", copy=False).astype("U", copy=False) + array = array.astype("O", copy=False).astype( + "U", copy=False + ) array = np.ma.masked_values(array, "") - + case _: if array.dtype is None: if g["has_groups"]: @@ -8194,21 +8196,21 @@ def _create_Data( variable = group.variables.get(name) else: variable = g["variables"].get(ncvar) - + array = variable[...] 
- + string_type = isinstance(array, str) if string_type: # A netCDF string type scalar variable comes # out as Python str object, so convert it to a # numpy array. array = np.array(array, dtype=f"U{len(array)}") - + if not variable.ndim: # NetCDF4 has a thing for making scalar size 1 # variables into 1d arrays array = array.squeeze() - + if not string_type: # An N-d (N>=1) netCDF string type variable # comes out as a numpy object array, so @@ -8217,7 +8219,6 @@ def _create_Data( # netCDF4 doesn't auto-mask VLEN variables # array = np.ma.where(array == "", np.ma.masked, array) array = np.ma.masked_values(array, "") - # Set the dask chunking strategy chunks = self._dask_chunks( @@ -11316,8 +11317,6 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): else: dask_chunks = g.get("dask_chunks", "storage-aligned") - storage_chunks = self._netcdf_chunksizes(g["variables"][ncvar]) - # ------------------------------------------------------------ # None # ------------------------------------------------------------ @@ -11325,6 +11324,8 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): # No Dask chunking return -1 + storage_chunks = self._dataset_chunksizes(g["variables"][ncvar]) + ndim = array.ndim if ( storage_chunks is not None @@ -11610,7 +11611,7 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): # ------------------------------------------------------------ return dask_chunks - def _cache_data_elements(self, data, ncvar): + def _cache_data_elements(self, data, ncvar, attributes=None): """Cache selected element values. Updates *data* in-place to store its first, second, @@ -11691,6 +11692,15 @@ def _cache_data_elements(self, data, ncvar): else: char = False + variable = netcdf_indexer( + variable, + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=False, + attributes=attributes, + copy=False, + ) if ndim == 1: # Also cache the second element for 1-d data, on the # assumption that they may well be dimension coordinate @@ -11706,6 +11716,7 @@ def _cache_data_elements(self, data, ncvar): else: indices = (0, 1, -1) values = (variable[:1], variable[1:2], variable[-1:]) + elif ndim == 2 and data.shape[-1] == 2: # Assume that 2-d data with a last dimension of size 2 # contains coordinate bounds, for which it is useful to @@ -11742,28 +11753,29 @@ def _cache_data_elements(self, data, ncvar): ) # Create a dictionary of the element values - elements = {} - for index, value in zip(indices, values): - if obj: - value = value.astype(str) - elif string: - # Convert an array of objects to an array of strings - value = np.array(value, dtype="U") - elif char: - # Variable is a netCDF classic style char array, so - # collapse (by concatenation) the outermost (fastest - # varying) dimension. E.g. [['a','b','c']] becomes - # ['abc'] - if dtype.kind == "U": - value = value.astype("S") - - a = netCDF4.chartostring(value) - shape = a.shape - a = np.array([x.rstrip() for x in a.flat]) - a = np.reshape(a, shape) - value = np.ma.masked_where(a == "", a) - - elements[index] = value + elements = {index: value for index, value in zip(indices, values)} + # for index, value in zip(indices, values): + # print (repr(value)) + # if obj: + # value = value.astype(str) + # elif string: + # # Convert an array of objects to an array of strings + # value = np.array(value, dtype="U") + # elif char: + # # Variable is a netCDF classic style char array, so + # # collapse (by concatenation) the outermost (fastest + # # varying) dimension. E.g. 
[['a','b','c']] becomes + # # ['abc'] + # if dtype.kind == "U": + # value = value.astype("S") + # print ('value=', value, value.dtype) + # a = netCDF4.chartostring(value) + # shape = a.shape + # a = np.array([x.rstrip() for x in a.flat]) + # a = np.reshape(a, shape) + # value = np.ma.masked_where(a == "", a) + + # elements[index] = value # Cache the cached data elements for this variable g["cached_data_elements"][ncvar] = elements @@ -11771,7 +11783,7 @@ def _cache_data_elements(self, data, ncvar): # Store the elements in the data object data._set_cached_elements(elements) - def _netcdf_chunksizes(self, variable): + def _dataset_chunksizes(self, variable): """Return the variable chunk sizes. .. versionadded:: (cfdm) 1.11.2.0 @@ -11804,8 +11816,10 @@ def _netcdf_chunksizes(self, variable): if chunks == "contiguous": chunks = None except AttributeError: - # h5netcdf + # h5netcdf, zarr chunks = variable.chunks + if not chunks: + chunks = None return chunks diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index cf463abb4..86b32d2f9 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -6,7 +6,6 @@ import dask.array as da import netCDF4 import numpy as np -import zarr from dask import config as dask_config from dask.array.core import normalize_chunks from dask.utils import parse_bytes @@ -350,6 +349,12 @@ def _set_attributes(self, attributes, ncvar=None, group=None): case "netCDF4": x.setncatts(attributes) case "zarr": + # `zarr` can't encode numpy arrays in the zarr.json + # file + for attr, value in attributes.items(): + if isinstance(value, np.ndarray): + attributes[attr] = value.tolist() + x.update_attributes(attributes) case _: raise ValueError(f"Bad backend: {g['backend']!r}") @@ -454,7 +459,7 @@ def _datatype(self, variable): if not isinstance(variable, np.ndarray): data = self.implementation.get_data(variable, None) if data is None: - if fmt == "ZARR3": + if g["fmt"] == "ZARR3": return str return "S1" @@ -463,7 +468,7 @@ def _datatype(self, variable): dtype = getattr(data, "dtype", None) if dtype is None or dtype.kind in "SU": - fmt = g["fmt"] + fmt = g["fmt"] if fmt == "NETCDF4" and g["string"]: return str @@ -2634,7 +2639,7 @@ def _createVariable(self, **kwargs): dtype = kwargs["datatype"] if dtype == "S1": dtype = str - + zarr_kwargs = { "name": ncvar, "shape": shape, @@ -2648,7 +2653,7 @@ def _createVariable(self, **kwargs): } print("zarr_kwargs = ", zarr_kwargs) variable = g["dataset"].create_array(**zarr_kwargs) - print('___________') + print("___________") case _: raise ValueError(f"Bad backend: {g['backend']!r}") @@ -2860,6 +2865,7 @@ def _write_netcdf_variable( # Do this after the dry_run return else may attempt to transform # the arrays with string dtype on an append-mode read iteration (bad). + datatype = None if not domain_variable: datatype = self._datatype(cfvar) data, ncdimensions = self._transform_strings( @@ -2875,14 +2881,22 @@ def _write_netcdf_variable( # filled before any data is written. if the fill value is # False then the variable is not pre-filled. 
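        # (Reviewer note: the Zarr branch below always needs a concrete
        # fill value because zarr arrays cannot hold masked values --
        # _write_data later substitutes this value for masked elements
        # -- whereas the netCDF4 branch only looks up the construct's
        # _FillValue when omit_data, fill, or an append-mode iteration
        # requires pre-filling.)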
# ------------------------------------------------------------ - if ( - omit_data or fill or g["post_dry_run"] - ): # or append mode's appending iteration - fill_value = self.implementation.get_property( - cfvar, "_FillValue", None - ) - else: - fill_value = None # ppp + match g["backend"]: + case "netCDF4": + if ( + omit_data or fill or g["post_dry_run"] + ): # or append mode's appending iteration + fill_value = self.implementation.get_property( + cfvar, "_FillValue", None + ) + else: + fill_value = None + + case "zarr": + # Set the `zarr` fill_value to the missing value of + # 'cfvar', defaulting to the netCDF default fill value + # if no missing value is available + fill_value = self._missing_value(cfvar, datatype) if data_variable: lsd = g["least_significant_digit"] @@ -3328,7 +3342,6 @@ def _write_data( `None` """ - print ('ncvar=', ncvar, repr(data)) g = self.write_vars if cfa: @@ -3342,12 +3355,10 @@ def _write_data( # Still here? The write a normal (non-aggregation) variable # ------------------------------------------------------------ if compressed: - # Write data in its compressed form + # Write data in its compressed form data = data.source().source() - print ('compressed' ,repr(data)) # Get the dask array - print('data.fill_value', data._FillValue) dx = da.asanyarray(data) # Convert the data type @@ -3355,7 +3366,6 @@ def _write_data( if new_dtype is not None: dx = dx.astype(new_dtype) - # VLEN variables can not be assigned to by masked arrays # (https://github.com/Unidata/netcdf4-python/pull/465), so # fill missing data in string (as opposed to char) data types. @@ -3379,10 +3389,41 @@ def _write_data( attributes=attributes, meta=np.array((), dx.dtype), ) - print('dx', repr(dx), dx.compute()) - print('ertertertr', repr(g["nc"][ncvar])) + + if g["backend"] == "zarr": + # `zarr` can't write a masked array to a variable, so we + # have to replace missing data with the fill value. + dx = dx.map_blocks( + self._filled_array, + meta=np.array((), dx.dtype), + fill_value=g["nc"][ncvar].fill_value, + ) + da.store(dx, g["nc"][ncvar], compute=True, return_stored=False) + def _filled_array(self, array, fill_value): + """Replace masked values with a fill value. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + array: `numpy.ndarray` + The arry to be filled. + + fill_value: + The fill value. + + :Returns: + + `numpy.ndarray` + + """ + if np.ma.isMA(array): + return array.filled(fill_value) + + return array + def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -4888,6 +4929,14 @@ def dataset_open(self, dataset_name, mode, fmt, fields): raise RuntimeError(f"{error}: {dataset_name}") case "zarr": + try: + import zarr + except ModuleNotFoundError as error: + error.msg += ( + ". Install the 'zarr' package to write Zarr datasets" + ) + raise + nc = zarr.group( dataset_name, overwrite=g["overwrite"], zarr_format=3 ) @@ -6668,3 +6717,45 @@ def _write_quantization_container(self, quantization): g["quantization"][ncvar] = quantization return ncvar + + def _missing_value(self, x, datatype): + """Get the missing value. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + x: construct or `Data` + The data for which to get the missing value. + + datatype: `str` or str + The data type, e.g. ``'S1'``, ``'f4'``, `str`. Used + to get the netCDF default fill value, but only when a + missing value can't be found from the attributes of + *x*. 
+ + :Returns: + + The missing value, or `None` if no missing value could + be found. + + """ + try: + # Try 'x' as a construct + mv = x.get_property("_FillValue", None) + if mv is None: + mv = x.get_property("missing_value", None) + except AttributeError: + try: + # Try 'x' as a `Data` object + mv = getattr(x, "fill_value", None) + except AttributeError: + mv = None + + if mv is None: + # Try to get the netCDF default fill value + mv = netCDF4.default_fillvals.get(datatype) + if mv is None and datatype is str: + mv = "" + + return mv diff --git a/cfdm/test/test_Data.py b/cfdm/test/test_Data.py index 96cfc2e39..9047567e9 100644 --- a/cfdm/test/test_Data.py +++ b/cfdm/test/test_Data.py @@ -1377,7 +1377,9 @@ def test_Data_masked_values(self): d = cfdm.Data(array) e = d.masked_values(1.1) ea = e.array - a = np.ma.masked_values(array, 1.1, rtol=cfdm.rtol(), atol=cfdm.atol()) + a = np.ma.masked_values( + array, 1.1, rtol=float(cfdm.rtol()), atol=float(cfdm.atol()) + ) self.assertTrue(np.isclose(ea, a).all()) self.assertTrue((ea.mask == a.mask).all()) self.assertIsNone(d.masked_values(1.1, inplace=True)) diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 29d5dc26c..44a905a54 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -2,47 +2,27 @@ import datetime import faulthandler import os -import platform import shutil -import subprocess import tempfile import unittest -import netCDF4 -import numpy as np - faulthandler.enable() # to debug seg faults and timeouts import cfdm -from cfdm.read_write.exceptions import DatasetTypeError, ReadError warnings = False # Set up temporary directories -n_tmp = 9 tmpdirs = [ - tempfile.mkdtemp("_test_zarr.zarr", dir=os.getcwd()) - for i in range(n_tmp) + tempfile.mkdtemp("_test_zarr.zarr", dir=os.getcwd()) for i in range(2) ] -[ - tmp1, - tmp2, - tmp3, - tmp4, - tmp5, - tmp6, - tmp7, - tmp8, - tmp9, -] = tmpdirs +[tmpdir1, tmpdir2] = tmpdirs # Set up temporary files -n_tmpfiles = 1 tmpfiles = [ - tempfile.mkstemp("_test_zarr.nc", dir=os.getcwd())[1] - for i in range(n_tmpfiles) + tempfile.mkstemp("_test_zarr.nc", dir=os.getcwd())[1] for i in range(2) ] -[tmpfile] = tmpfiles +[tmpfile1, tmpfile2] = tmpfiles def _remove_tmpdirs(): @@ -109,30 +89,67 @@ def setUp(self): # < ... test code ... 
> # cfdm.log_level('DISABLE') - def test_read_write_zarr_1(self): - """Test the writing of a named netCDF file.""" - i = 0 - for f in cfdm.example_fields(0, 1, 2, 3): - print ('\n\n==================================', i) - print(f) - tmp1 = 'tmp.zarr' - cfdm.write(f, tmp1, fmt='ZARR3') - g = cfdm.read(tmp1, verbose=1) - self.assertEqual(len(g) , 1) - g = g[0] - print(g) - - self.assertTrue(g.equals(f, verbose=1)) - - print ('\n\n eq1 done\n\n') - # Check that the Zarr and netCDF4 encoding contain the - # same information - tmpfile = 'delme.nc' - cfdm.write(f, tmpfile, fmt='NETCDF4') - n = cfdm.read(tmpfile)[0] - self.assertTrue(g.equals(n)) - - i += 1 + def test_zarr_read_write_1(self): + """Test Zarr read/write on example fields.""" + for i, f in enumerate(cfdm.example_fields()): + if i in (8, 9, 10): + # Can't write UGRID yet + continue + + cfdm.write(f, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1) + self.assertEqual(len(z), 1) + z = z[0] + self.assertTrue(z.equals(f)) + + # Check that the Zarr and netCDF4 encodings are equivalent + tmpfile1 = "delme.nc" + cfdm.write(f, tmpfile1, fmt="NETCDF4") + n = cfdm.read(tmpfile1)[0] + self.assertTrue(z.equals(n)) + + def test_zarr_read_write_2(self): + """Test Zarr read/write on test netCDF files.""" + for filename in ( + "DSG_timeSeries_contiguous.nc", + "DSG_timeSeries_indexed.nc", + "DSG_timeSeriesProfile_indexed_contiguous.nc", + "gathered.nc", + "geometry_1.nc", + "geometry_2.nc", + "geometry_3.nc", + "geometry_4.nc", + "string_char.nc", + ): + n = cfdm.read(filename) + cfdm.write(n, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1) + self.assertEqual(len(z), len(n)) + for a, b in zip(z, n): + self.assertTrue(a.equals(b)) + + def test_zarr_read_write_CFA(self): + """Test CF aggreagtion in Zarr.""" + f = self.f0 + cfdm.write(f, tmpdir1, fmt="ZARR3") + cfdm.write(f, tmpfile1, fmt="NETCDF4") + + z = cfdm.read(tmpdir1, cfa_write="field")[0] + n = cfdm.read(tmpfile1, cfa_write="field")[0] + + self.assertTrue(z.equals(f)) + self.assertTrue(z.equals(n)) + + cfdm.write(z, tmpdir2, fmt="ZARR3", cfa="field") + cfdm.write(n, tmpfile2, fmt="NETCDF4", cfa="field") + + z = cfdm.read(tmpdir2)[0] + n = cfdm.read(tmpfile2)[0] + + self.assertTrue(z.equals(f)) + self.assertTrue(z.equals(n)) + + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) cfdm.environment() diff --git a/docs/source/installation.rst b/docs/source/installation.rst index f46120ecf..1aeebe9d3 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -195,7 +195,7 @@ The cfdm package requires: * `h5py `_, version 3.12.1 or newer. -* `zarr `_, version 3.0.8 or newer. +* `zarr `_, version 3.1.2 or newer. * `s3fs `_, version 2024.6.0 or newer. 
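The new tests above reduce to the following round trip, shown here as an
illustrative sketch rather than as part of the patch (the example field and
the "tmp.zarr"/"delme.nc" names are taken from the test code; any writable
paths would do):

import cfdm

f = cfdm.example_field(0)

# Write the field to a Zarr v3 store and read it back
cfdm.write(f, "tmp.zarr", fmt="ZARR3")
z = cfdm.read("tmp.zarr")
assert len(z) == 1 and z[0].equals(f)

# The Zarr and netCDF-4 encodings should carry the same information
cfdm.write(f, "delme.nc", fmt="NETCDF4")
n = cfdm.read("delme.nc")[0]
assert z[0].equals(n)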
diff --git a/requirements.txt b/requirements.txt index 3b37323a5..768a5bac3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ dask>=2025.5.1 distributed>=2025.5.1 uritools>=4.0.3 cfunits>=3.3.7 -zarr>=3.0.8 +zarr>=3.1.2 From b0de4b3ffb43d0f2216eb887985fbeb20e0a6204 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 28 Aug 2025 00:30:31 +0100 Subject: [PATCH 08/39] dev --- cfdm/cfdmimplementation.py | 17 +++ cfdm/data/data.py | 6 +- cfdm/mixin/netcdf.py | 178 +++++++++++++++++++++++++- cfdm/read_write/netcdf/netcdfwrite.py | 107 ++++++++++------ cfdm/test/test_Data.py | 34 +++++ cfdm/test/test_zarr.py | 53 ++++---- 6 files changed, 326 insertions(+), 69 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 11c6fefce..669e62266 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1103,6 +1103,23 @@ def nc_get_dataset_chunksizes(self, data): """ return data.nc_dataset_chunksizes() + def nc_get_dataset_shards(self, data): + """Get the dataset sharding strategy for the data. + + ..versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + data: `Data` + + :Returns: + + `int` or `tuple` or `None` + The dataset sharding strategy. + + """ + return data.nc_dataset_shards() + def nc_get_sample_dimension(self, count, default=None): """Return the name of the netCDF sample dimension. diff --git a/cfdm/data/data.py b/cfdm/data/data.py index ba88554f0..e0bb7721b 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -28,7 +28,7 @@ ) from ..mixin.container import Container from ..mixin.files import Files -from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks +from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks, ZarrShards from ..units import Units from .abstract import Array from .creation import to_dask @@ -55,7 +55,9 @@ logger = logging.getLogger(__name__) -class Data(Container, NetCDFAggregation, NetCDFChunks, Files, core.Data): +class Data( + Container, NetCDFAggregation, NetCDFChunks, ZarrShards, Files, core.Data +): """An N-dimensional data array with units and masked values. * Contains an N-dimensional, indexable and broadcastable array with diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index 426c05754..0d6c85f86 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -32,7 +32,7 @@ def __initialise_from_source(self, source, copy=True): :Parameters: source: - The object from which to extract the initialisation + N The object from which to extract the initialisation information. Typically, but not necessarily, a `{{class}}` object. @@ -5129,3 +5129,179 @@ def nc_set_aggregation_write_status(self, status): ) self._nc_set_aggregation_write_status(status) + + +class ZarrShards(NetCDFMixin): + """Mixin class for accessing dataset shard size. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def nc_dataset_shards(self, todict=False): + """Get the dataset shard size for the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. 
seealso:: `nc_clear_dataset_shards`, + `nc_set_dataset_shards`, `{{package}}.write` + + :Parameters: + + {{chunk todict: `bool`, optional}} + + :Returns: + + {{Returns nc_dataset_chunksizes}} + + **Examples** + + >>> d.shape + (1, 96, 73) + >>> d.nc_set_dataset_chunksizes([1, 35, 73]) + >>> d.nc_dataset_chunksizes() + (1, 35, 73) + >>> d.nc_dataset_chunksizes(todict=True) + {0: 1, 1: 35, 2: 73} + >>> d.nc_clear_dataset_chunksizes() + (1, 35, 73) + >>> d.nc_dataset_chunksizes() + None + >>> d.nc_set_dataset_chunksizes('contiguous') + >>> d.nc_dataset_chunksizes() + 'contiguous' + >>> d.nc_set_dataset_chunksizes('1 KiB') + >>> d.nc_dataset_chunksizes() + 1024 + >>> d.nc_set_dataset_chunksizes(None) + >>> d.nc_dataset_chunksizes() + None + + """ + return self._get_netcdf().get("dataset_shards") + + def nc_clear_dataset_shards(self): + """Clear the dataset shard size for the data. + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_dataset_shards`, `nc_set_dataset_shards`, + `{{package}}.write` + + :Returns: + + `None` or `str` or `int` or `tuple` of `int` + The chunking strategy prior to being cleared, as would + be returned by `nc_dataset_chunksizes`. + + **Examples** + + >>> d.shape + (1, 96, 73) + >>> d.nc_set_dataset_chunksizes([1, 35, 73]) + >>> d.nc_clear_dataset_chunksizes() + (1, 35, 73) + >>> d.nc_set_dataset_chunksizes('1 KiB') + >>> d.nc_clear_dataset_chunksizes() + 1024 + >>> d.nc_set_dataset_chunksizes(None) + >>> print(d.nc_clear_dataset_chunksizes()) + None + + """ + return self._get_netcdf().pop("dataset_shards", None) + + def nc_set_dataset_shards(self, shards): + """Set the dataset sharding strategy for the data. + + The sharding strategy is either the integer number of chunks + stored in a single storage object (e.g. a file), or else + `None` to indicate that there is no sharding (i.e. each chunk + is stored in a different storage object). + + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_dataset_shards`, `nc_clear_dataset_shards`, + `{{package}}.write` + + :Parameters: + + {{chunk chunksizes}} + + Each dictionary key is an integer that specifies an + axis by its position in the data array. + + :Returns: + + `None` + + **Examples** + + >>> d.shape + (1, 96, 73) + >>> d.nc_set_dataset_chunksizes([1, 35, 73]) + >>> d.nc_dataset_chunksizes() + (1, 35, 73) + >>> d.nc_clear_dataset_chunksizes() + (1, 35, 73) + >>> d.nc_dataset_chunksizes() + None + >>> d.nc_set_dataset_chunksizes('contiguous') + >>> d.nc_dataset_chunksizes() + 'contiguous' + >>> d.nc_set_dataset_chunksizes('1 KiB') + >>> d.nc_dataset_chunksizes() + 1024 + >>> d.nc_set_dataset_chunksizes(None) + >>> d.nc_dataset_chunksizes() + None + >>> d.nc_set_dataset_chunksizes([9999, -1, None]) + >>> d.nc_dataset_chunksizes() + (1, 96, 73) + >>> d.nc_clear_dataset_chunksizes() + (1, 96, 73) + >>> d.nc_set_dataset_chunksizes({1: 24}) + >>> d.nc_dataset_chunksizes() + (1, 24, 73) + >>> d.nc_set_dataset_chunksizes({0: None, 2: 50}) + >>> d.nc_dataset_chunksizes() + (1, 24, 50) + + """ + if shards is None: + self.nc_clear_dataset_shards() + return + + if isinstance(shards, Integral): + if shards < 1: + raise ValueError( + f"'shards' must be None, a positive integer, or a " + f"sequence positive of integers. Got {shards!r}" + ) + + self._set_netcdf("dataset_shards", shards) + return + + try: + shards = tuple(shards) + except TypeError: + raise ValueError( + f"'shards' must be None, a positive integer, or a " + f"sequence positive of integers. 
Got {shards!r}" + ) + + shape = self.shape + if len(shards) != len(shape): + raise ValueError( + f"When shards is a sequence {shards!r} then it must have the " + f"same length as the number of data dimensions ({len(shape)})" + ) + + for n, i in enumerate(shards): + if not (isinstance(i, Integral) and i > 0): + raise ValueError( + f"Shard size for dimension position {n} must be " + f"a positive integer. Got {i!r}" + ) + + self._set_netcdf("dataset_shards", shards) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 86b32d2f9..a3078f14d 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2,6 +2,8 @@ import logging import os import re +from math import prod +from numbers import Integral import dask.array as da import netCDF4 @@ -2601,9 +2603,8 @@ def _createVariable(self, **kwargs): if "dimensions" not in kwargs: netcdf4_kwargs["dimensions"] = () - if kwargs.get("contiguous") and g[ - "dataset" - ].data_model.startswith("NETCDF4"): + NETCDF4 = g["dataset"].data_model.startswith("NETCDF4") + if NETCDF4 and kwargs.get("contiguous"): # NETCDF4 contiguous variables can't be compressed kwargs["compression"] = None kwargs["complevel"] = 0 @@ -2624,7 +2625,7 @@ def _createVariable(self, **kwargs): f"{unlimited_dimensions}" ) - # Remove any Zarr-specific kwargs + # Remove Zarr-specific kwargs netcdf4_kwargs.pop("shape", None) netcdf4_kwargs.pop("shards", None) @@ -2632,10 +2633,32 @@ def _createVariable(self, **kwargs): case "zarr": shape = kwargs.get("shape", ()) - chunks = kwargs.get("chunksizes", "auto") - if chunks is None or not shape: + chunks = kwargs.get("chunksizes", shape) + shards = kwargs.get("shards") + + if chunks is None: chunks = shape + # Calculate the shard shape + if chunks: + if isinstance(shards, Integral): + n = int(shards ** (1 / len(chunks))) + if n > 1: + # More than one chunk per shard + shards = [c * n for c in chunks] + else: + # One chunk per shard + shards = None + elif shards and prod(shards) > 1: + # More than one chunk per shard + shards = [c * n for c, n in zip(chunks, shards)] + else: + # One chunk per shard + shards = None + else: + # One chunk per shard + shards = None + dtype = kwargs["datatype"] if dtype == "S1": dtype = str @@ -2645,7 +2668,7 @@ def _createVariable(self, **kwargs): "shape": shape, "dtype": dtype, "chunks": chunks, - "shards": kwargs.get("shards"), + "shards": shards, "fill_value": kwargs.get("fill_value"), "dimension_names": kwargs.get("dimensions", ()), "storage_options": g.get("storage_options"), @@ -2907,7 +2930,7 @@ def _write_netcdf_variable( if chunking: contiguous, chunksizes = chunking else: - contiguous, chunksizes = self._chunking_parameters( + contiguous, chunksizes, shards = self._chunking_parameters( data, ncdimensions ) @@ -2951,6 +2974,7 @@ def _write_netcdf_variable( "endian": g["endian"], "contiguous": contiguous, "chunksizes": chunksizes, + "shards": shards, "least_significant_digit": lsd, "fill_value": fill_value, "chunk_cache": g["chunk_cache"], @@ -3376,6 +3400,17 @@ def _write_data( meta=np.array((), dx.dtype), ) + # If a Zarr variable is sharded, then rechunk the Dask array + # to the shards, because "when writing data, a full shard must + # be written in one go for optimal performance and to avoid + # concurrency issues." + # (https://zarr.readthedocs.io/en/stable/user-guide/arrays.html). 
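        # (Reviewer note: concretely, with dataset chunks of (2, 3) and
        # a shard shape of (4, 6) -- the case exercised by
        # test_zarr_read_write_shards -- each shard holds a 2 x 2 block
        # of chunks, so rechunking the Dask array to (4, 6) makes every
        # store task write whole shards.)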
+ if g["backend"] == "zarr": + shards = g["nc"][ncvar].shards + if shards is not None: + print(f"Zarr: rechunking to shards {shards} from {dx.chunks}") + dx = dx.rechunk(shards) + # Check for out-of-range values if g["warn_valid"]: if construct_type: @@ -3399,7 +3434,15 @@ def _write_data( fill_value=g["nc"][ncvar].fill_value, ) - da.store(dx, g["nc"][ncvar], compute=True, return_stored=False) + # try: + # except AttributeError: + # print ('chunks:', g["nc"][ncvar].chunking()) + + from ...data.locks import netcdf_lock as lock + + da.store( + dx, g["nc"][ncvar], compute=True, return_stored=False, lock=lock + ) def _filled_array(self, array, fill_value): """Replace masked values with a fill value. @@ -5218,7 +5261,7 @@ def write( The dataset chunking strategy. The default value is "4MiB". See `cfdm.write` for details. - dataset_shards: `str`, `int`, or `float`, optional + dataset_shards: `int` or `None`, optional The Zarr dataset sharding strategy. The default value is `None`. See `cfdm.write` for details. @@ -5390,11 +5433,9 @@ def write( # Parse the 'dataset_shards' parameter if dataset_shards is not None: - try: - self.write_vars["dataset_shards"] = parse_bytes(dataset_shards) - except (ValueError, AttributeError): + if not isinstance(dataset_shards, Integral) or dataset_shards < 1: raise ValueError( - "Invalid value for the 'dataset_shards' keyword: " + f"Invalid value for 'dataset_shards' keyword: " f"{dataset_shards!r}." ) @@ -5917,13 +5958,13 @@ def _chunking_parameters(self, data, ncdimensions): :Returns: - 2-tuple - The *contiguous* and *chunksizes* parameters for - `_createVariable`. + 3-tuple + The *contiguous*, *chunksizes*, and *shards* + parameters for `_createVariable`. """ if data is None: - return False, None + return False, None, None g = self.write_vars @@ -5933,24 +5974,29 @@ def _chunking_parameters(self, data, ncdimensions): # ------------------------------------------------------------ # Get the chunking strategy defined by the data itself chunksizes = self.implementation.nc_get_dataset_chunksizes(data) + shards = self.implementation.nc_get_dataset_shards(data) + if chunksizes == "contiguous": # Contiguous as defined by 'data' - return True, None + return True, None, None # Still here? + if shards is None: + shards = g["dataset_shards"] + dataset_chunks = g["dataset_chunks"] if isinstance(chunksizes, int): # Reset dataset chunks to the integer given by 'data' dataset_chunks = chunksizes elif chunksizes is not None: # Chunked as defined by the tuple of int given by 'data' - return False, chunksizes + return False, chunksizes, shards # Still here? Then work out the chunking strategy from the # dataset_chunks if dataset_chunks == "contiguous": # Contiguous as defined by 'dataset_chunks' - return True, None + return True, None, None # Still here? Then work out the chunks from both the # size-in-bytes given by dataset_chunks (e.g. 1024, or '1 @@ -5973,26 +6019,11 @@ def _chunking_parameters(self, data, ncdimensions): # (250, 250, 4)). However, we only want one number per # dimension, so we choose the largest: [96, 250]. chunksizes = [max(c) for c in chunksizes] - return False, chunksizes + return False, chunksizes, shards else: # The data is scalar, so 'chunksizes' is () => write the # data contiguously. 
- return True, None - - # def _shape_in_dataset(self, data, ncdimensions): - # """TODOZARR.""" - # if data is not None: - # # Get the shape from the data array - # if self._compressed_data(ncdimensions): - # d = self.implementation.get_compressed_array(data) - # else: - # d = data - # - # return d.shape - # - # # Still here? Then there's no data, so get the shape from the - # # netCDF dimensions - # return tuple([g['ncdim_to_size'][ncdim] for ncdim in ncdimensions]) + return True, None, None def _compressed_data(self, ncdimensions): """Whether or not the data is being written in compressed form. diff --git a/cfdm/test/test_Data.py b/cfdm/test/test_Data.py index 9047567e9..2749be5d7 100644 --- a/cfdm/test/test_Data.py +++ b/cfdm/test/test_Data.py @@ -2853,6 +2853,40 @@ def test_Data_zeros(self): self.assertEqual(d.dtype, dtype_out) self.assertTrue((d.array == np.zeros(shape, dtype=dtype_in)).all()) + def test_Data_dataset_shards(self): + """Test Data.nc_dataset_shards.""" + d = cfdm.Data(np.arange(24).reshape(2, 3, 4)) + + self.assertIsNone(d.nc_dataset_shards()) + self.assertIsNone(d.nc_set_dataset_shards([1, 2, 3])) + self.assertEqual(d.nc_dataset_shards(), (1, 2, 3)) + self.assertEqual(d.nc_clear_dataset_shards(), (1, 2, 3)) + self.assertIsNone(d.nc_dataset_shards()) + + self.assertIsNone(d.nc_set_dataset_shards(None)) + self.assertIsNone(d.nc_dataset_shards(None)) + + self.assertIsNone(d.nc_set_dataset_shards(100)) + self.assertEqual(d.nc_dataset_shards(), 100) + + # Check that shards get copied + self.assertEqual(d.copy().nc_dataset_shards(), 100) + + # Bad shards + for shards in ( + [2], + [-99, 3, 4], + [2, 3, 3.14], + ["bad", 3, 4], + [2, None, 4], + [2, 3, -1], + "bad", + -1, + 3.14, + ): + with self.assertRaises(ValueError): + d.nc_set_dataset_shards(shards) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 44a905a54..dc5937436 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -8,6 +8,8 @@ faulthandler.enable() # to debug seg faults and timeouts +import zarr + import cfdm warnings = False @@ -43,39 +45,11 @@ def _remove_tmpdirs(): atexit.register(_remove_tmpdirs) -filename = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "test_file.nc" -) - class read_writeTest(unittest.TestCase): """Test the reading and writing of field constructs from/to disk.""" - filename = filename - - zarr2 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "example_field_0.zarr2" - ) - - zarr3 = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "example_field_0.zarr3" - ) - f0 = cfdm.example_field(0) - f1 = cfdm.example_field(1) - - string_filename = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "string_char.nc" - ) - - netcdf3_fmts = [ - "NETCDF3_CLASSIC", - "NETCDF3_64BIT", - "NETCDF3_64BIT_OFFSET", - "NETCDF3_64BIT_DATA", - ] - netcdf4_fmts = ["NETCDF4", "NETCDF4_CLASSIC"] - netcdf_fmts = netcdf3_fmts + netcdf4_fmts def setUp(self): """Preparations called immediately before each test method.""" @@ -128,6 +102,29 @@ def test_zarr_read_write_2(self): for a, b in zip(z, n): self.assertTrue(a.equals(b)) + def test_zarr_read_write_shards(self): + """Test Zarr read/write with shards.""" + f = self.f0.copy() + f.data.nc_set_dataset_chunksizes([2, 3]) + + cfdm.write(f, tmpdir1, fmt="ZARR3") + z = zarr.open(tmpdir1) + self.assertEqual(z["q"].chunks, (2, 3)) + self.assertIsNone(z["q"].shards) + + # Make shards comprising 4 chunks + cfdm.write(f, tmpdir1, 
fmt="ZARR3", dataset_shards=4) + z = zarr.open(tmpdir1) + self.assertEqual(z["q"].chunks, (2, 3)) + self.assertEqual(z["q"].shards, (4, 6)) + + for shards in (4, [2, 2]): + f.data.nc_set_dataset_shards(shards) + cfdm.write(f, tmpdir1, fmt="ZARR3") + z = zarr.open(tmpdir1) + self.assertEqual(z["q"].chunks, (2, 3)) + self.assertEqual(z["q"].shards, (4, 6)) + def test_zarr_read_write_CFA(self): """Test CF aggreagtion in Zarr.""" f = self.f0 From b0d2fc5a24e9377f88703545cdb93f72e2fc16b0 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 28 Aug 2025 23:31:48 +0100 Subject: [PATCH 09/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 411 ++++++++++++++++------ cfdm/read_write/netcdf/netcdfread.py | 392 ++++++++++++++------- cfdm/read_write/netcdf/netcdfwrite.py | 152 +++++--- cfdm/read_write/netcdf/zarr.py | 14 +- cfdm/test/test_groups.py | 1 + cfdm/test/test_zarr.py | 15 +- 6 files changed, 700 insertions(+), 285 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 1ca120f25..612548fa1 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -270,15 +270,62 @@ def __init__( See `netcdf_flatten`. """ + # Mapping of flattened attribute names to their full-path + # counterparts. + # + # E.g. ['Conventions: /Conventions'] self._attr_map_value = [] + + # Mapping of flattened dimension names to their full-path + # counterparts. + # + # E.g. ['bounds2: /bounds2', + # 'x: /x', + # 'forecast__y: /forecast/y'] self._dim_map_value = [] + + # Mapping of flattened variable names to their full-path + # counterparts. + # + # E.g. ['x_bnds: /x_bnds', + # 'x: /x', + # 'b_bounds: /b_bounds', + # 'b: /b', + # 'latitude_longitude: /latitude_longitude', + # 'forecast__y: /forecast/y'] self._var_map_value = [] + # Mapping of full-path dimension names to their flattened + # counterparts. + # + # E.g. {'/bounds2': 'bounds2', + # '/x': 'x', + # '/forecast/y': 'forecast__y'} self._dim_map = {} + + # Mapping of full-path variable names to their flattened + # counterparts. + # + # E.g. {'/x_bnds': 'x_bnds', + # '/x': 'x', + # '/b_bounds': 'b_bounds', + # '/b': 'b', + # '/latitude_longitude': 'latitude_longitude', + # '/forecast/y': 'forecast__y'} self._var_map = {} self._input_ds = input_ds self._output_ds = output_ds + + if hasattr(input_ds, "_h5file"): + self._input_ds_backend = "h5netcdf" + elif hasattr(input_ds, "data_model"): + self._input_ds_backend = "netCDF4" + elif hasattr(input_ds, "store"): + self._input_ds_backend = "zarr" + else: + raise ValueError("TODOZARR") + self._strict = bool(strict) self._omit_data = bool(omit_data) self._write_chunksize = write_chunksize @@ -293,7 +340,7 @@ def __init__( "be different, and output should be of the 'NETCDF4' format." ) - def attrs(self, variable): + def attrs(self, variable, backend=None): """Return the variable attributes. .. versionadded:: (cfdm) 1.11.2.0 @@ -311,14 +358,15 @@ def attrs(self, variable): names. """ - try: - # h5netcdf - return dict(variable.attrs) - except AttributeError: - # netCDF4 - return { - attr: variable.getncattr(attr) for attr in variable.ncattrs() - } + match self._backend(backend): + case "netCDF4": + return { + attr: variable.getncattr(attr) + for attr in variable.ncattrs() + } + + case "h5netcdf" | "zarr": + return dict(variable.attrs) def chunksizes(self, variable): """Return the variable chunk sizes. 
@@ -346,16 +394,16 @@ def chunksizes(self, variable): None """ - try: - # netCDF4 - chunking = variable.chunking() - if chunking == "contiguous": - return None + match self._backend(): + case "h5netcdf" | "zarr": + return variable.chunks + + case "netCDF4": + chunking = variable.chunking() + if chunking == "contiguous": + return None - return chunking - except AttributeError: - # h5netcdf - return variable.chunks + return chunking def contiguous(self, variable): """Whether or not the variable data is contiguous on disk. @@ -380,12 +428,12 @@ def contiguous(self, variable): False """ - try: - # netCDF4 - return variable.chunking() == "contiguous" - except AttributeError: - # h5netcdf - return variable.chunks is None + match self._backend(): + case "h5netcdf" | "zarr": + return variable.chunks is None + + case "netCDF4": + return variable.chunking() == "contiguous" def dtype(self, variable): """Return the data type of a variable. @@ -412,8 +460,10 @@ def dtype(self, variable): str """ + from numpy.dtypes import StringDType + out = variable.dtype - if out == "O": + if out in ("O", StringDType()): out = str return out @@ -441,13 +491,13 @@ def endian(self, variable): 'native' """ - try: - # netCDF4 - return variable.endian() - except AttributeError: - # h5netcdf - dtype = variable.dtype - return _dtype_endian_lookup[getattr(dtype, "byteorder", None)] + match self._backend(): + case "h5netcdf" | "zarr": + dtype = variable.dtype + return _dtype_endian_lookup[getattr(dtype, "byteorder", None)] + + case "netCDF4": + return variable.endian() def filepath(self, dataset): """Return the file path for the dataset. @@ -472,12 +522,15 @@ def filepath(self, dataset): '/home/data/file.nc' """ - try: - # netCDF4 - return dataset.filepath() - except AttributeError: - # h5netcdf - return dataset.filename + match self._backend(): + case "h5netcdf": + return dataset.filename + + case "netCDF4": + return dataset.filepath() + + case "zarr": + return str(dataset.store_path) def get_dims(self, variable): """Return the dimensions associated with a variable. @@ -489,29 +542,32 @@ def get_dims(self, variable): `list` """ - try: - # netCDF4 - return variable.get_dims() - except AttributeError: - # h5netcdf - dims = {} - dimension_names = list(variable.dimensions) - group = variable._parent - for name, dim in group.dims.items(): - if name in dimension_names: - dims[name] = dim - dimension_names.remove(name) - - group = group.parent - while group is not None and dimension_names: + match self._backend(): + case "netCDF4": + return variable.get_dims() + + case "h5netcdf": + dims = {} + dimension_names = list(variable.dimensions) + group = variable._parent for name, dim in group.dims.items(): if name in dimension_names: dims[name] = dim dimension_names.remove(name) group = group.parent + while group is not None and dimension_names: + for name, dim in group.dims.items(): + if name in dimension_names: + dims[name] = dim + dimension_names.remove(name) + + group = group.parent - return [dims[name] for name in variable.dimensions] + return [dims[name] for name in variable.dimensions] + + case "zarr": + return tuple(self._zarr_var_to_dims[variable.path]) def getncattr(self, x, attr): """Retrieve a netCDF attribute. 
@@ -527,15 +583,15 @@ def getncattr(self, x, attr): :Returns: """ - try: - # netCDF4 - return getattr(x, attr) - except AttributeError: - # h5netcdf - return x.attrs[attr] + match self._backend(): + case "h5netcdf" | "zarr": + return x.attrs[attr] + + case "netCDF4": + return getattr(x, attr) def group(self, x): - """Return the group that a variable belongs to. + """Return the group that a variable or dimension belongs to. .. versionadded:: (cfdm) 1.11.2.0 @@ -544,14 +600,25 @@ def group(self, x): `Group` """ - try: - # netCDF4 - return x.group() - except AttributeError: - # h5netcdf - return x._parent + match self._backend(): + case "netCDF4": + return x.group() + + case "h5netcdf": + return x._parent + + case "zarr": + try: + # Variable + group_name = group_separator.join( + x.path.split(group_separator)[:-1] + ) + return self._input_ds[group_name] + except AttributeError: + # Dimension + return x.group() - def name(self, x): + def name(self, x, backend=None): """Return the netCDF name, without its groups. .. versionadded:: (cfdm) 1.11.2.0 @@ -561,12 +628,17 @@ def name(self, x): `str` """ - out = x.name - if group_separator in out: - # h5netcdf - out = x.name.split(group_separator)[-1] + match self._backend(backend): + case "h5netcdf" | "netCDF4": + return x.name.split(group_separator)[-1] - return out + case "zarr": + try: + # Variable + return x.path.split(group_separator)[-1] + except AttributeError: + # Dimension + return x.name.split(group_separator)[-1] def ncattrs(self, x): """Return netCDF attribute names. @@ -582,12 +654,12 @@ def ncattrs(self, x): `list` """ - try: - # netCDF4 - return x.ncattrs() - except AttributeError: - # h5netcdf - return list(x.attrs) + match self._backend(): + case "h5netcdf" | "zarr": + return list(x.attrs) + + case "netCDF4": + return x.ncattrs() def parent(self, group): """Return a simulated unix parent group. @@ -596,13 +668,26 @@ def parent(self, group): :Returns: - `str` + `Group` or `None` + The parent grup, or `None` if *group* is the root + group (and so has no parent). """ - try: - return group.parent - except AttributeError: - return + match self._backend(): + case "h5netcdf" | "netCDF4": + try: + return group.parent + except AttributeError: + return + + case "zarr": + name = group.name + if name == group_separator: + return + + return self._input_ds[ + group_separator.join(name.split(group_separator)[:-1]) + ] def path(self, group): """Return a simulated unix directory path to a group. @@ -614,15 +699,15 @@ def path(self, group): `str` """ - try: - # netCDF4 - return group.path - except AttributeError: - # h5netcdf - try: - return group.name - except AttributeError: - return group_separator + match self._backend(): + case "h5netcdf" | "zarr": + try: + return group.name + except AttributeError: + return group_separator + + case "netCDF4": + return group.path def flatten(self): """Flattens and writes to output file. 
@@ -675,13 +760,22 @@ def process_group(self, input_group): for attr_name in self.ncattrs(input_group): self.flatten_attribute(input_group, attr_name) - for dim in input_group.dimensions.values(): + # for dim in input_group.dimensions.values(): + # self.flatten_dimension(dim) + + for dim in self._dimensions(input_group).values(): self.flatten_dimension(dim) - for var in input_group.variables.values(): + # for var in input_group.variables.values(): + # self.flatten_variable(var) + + for var in self._variables(input_group).values(): self.flatten_variable(var) - for child_group in input_group.groups.values(): + # for child_group in input_group.groups.values(): + # self.process_group(child_group) + + for child_group in self._child_groups(input_group).values(): self.process_group(child_group) def flatten_attribute(self, input_group, attr_name): @@ -747,6 +841,7 @@ def flatten_dimension(self, dim): ) # Write dimension + # print ('creating dimension:', new_name,) # '(org in', self.group(dim)) self._output_ds.createDimension( new_name, (len(dim), None)[dim.isunlimited()] ) @@ -811,6 +906,7 @@ def flatten_variable(self, var): else: fill_value = attributes.pop("_FillValue", None) + # print ('creating variable:', new_name)# '(org in', self.group(var)) new_var = self._output_ds.createVariable( new_name, self.dtype(var), @@ -1249,6 +1345,7 @@ def search_by_relative_path(self, ref, current_group, search_dim): if search_dim: elt = current_group.dimensions[ref_split[-1]] else: + # elt = current_group.variables[ref_split[-1]] elt = current_group.variables[ref_split[-1]] # Get absolute reference @@ -1299,31 +1396,39 @@ def search_by_proximity( """ if search_dim: - dims_or_vars = current_group.dimensions + # dims_or_vars = current_group.dimensions # TODOZARR + dims_or_vars = self._dimensions(current_group) else: - dims_or_vars = current_group.variables + # dims_or_vars = current_group.variables + dims_or_vars = self._variables(current_group) # Found in current group if ref in dims_or_vars.keys(): return dims_or_vars[ref] local_apex_reached = ( - local_apex_reached or ref in current_group.dimensions.keys() + # local_apex_reached or ref in current_group.dimensions.keys() + local_apex_reached + or ref in self._dimensions(current_group).keys() ) # Check if have to continue looking in parent group # - normal search: continue until root is reached # - coordinate variable: continue until local apex is reached + parent_group = self.parent(current_group) if is_coordinate_variable: - top_reached = local_apex_reached or current_group.parent is None + # top_reached = local_apex_reached or current_group.parent is None + top_reached = local_apex_reached or parent_group is None else: - top_reached = current_group.parent is None + # top_reached = current_group.parent is None + top_reached = parent_group is None # Search up if not top_reached: return self.search_by_proximity( ref, - current_group.parent, + # current_group.parent, + parent_group, search_dim, local_apex_reached, is_coordinate_variable, @@ -1375,7 +1480,7 @@ def resolve_references(self, var, old_var): `None` """ - var_attrs = self.attrs(var) + var_attrs = self.attrs(var, "netCDF4") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value parsed_attribute = parse_attribute(name, var_attrs[name]) @@ -1423,7 +1528,7 @@ def adapt_references(self, var): `None` """ - var_attrs = self.attrs(var) + var_attrs = self.attrs(var, "netCDF4") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value value = var_attrs[name] 
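Taken together, the hunks above apply one pattern throughout the flattener:
detect the backend once from the type of the input dataset, then dispatch on
it wherever the netCDF4, h5netcdf and zarr APIs diverge. A condensed,
standalone sketch of that pattern (the class name is illustrative; the
attribute probes and the per-backend attribute calls are the ones used in
this patch):

class BackendAccessor:
    """Illustrative reduction of the dispatch used by _Flattener."""

    def __init__(self, dataset):
        # Detect the backend from the input dataset object
        if hasattr(dataset, "_h5file"):
            self._backend = "h5netcdf"
        elif hasattr(dataset, "data_model"):
            self._backend = "netCDF4"
        elif hasattr(dataset, "store"):
            self._backend = "zarr"
        else:
            raise ValueError("Unrecognised dataset backend")

    def attrs(self, variable):
        """Return the variable attributes as a dictionary."""
        match self._backend:
            case "netCDF4":
                return {
                    attr: variable.getncattr(attr)
                    for attr in variable.ncattrs()
                }
            case "h5netcdf" | "zarr":
                return dict(variable.attrs)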
@@ -1448,8 +1553,8 @@ def adapt_references(self, var): var.setncattr(name, new_attr_value) logging.info( - f" Value of {self.name(var)}.{name} changed " - f"from {value!r} to {new_attr_value!r}" + f" Value of {self.name(var, 'netCDF4')}.{name} " + f"changed from {value!r} to {new_attr_value!r}" ) def adapt_name(self, resolved_ref, rules): @@ -1679,6 +1784,100 @@ def handle_reference_error(self, ref, context=None): warnings.warn(message) return f"{ref_not_found_error}_{ref}" + def _dimensions(self, group): + """Return dimensions that are defined in this group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + + :Returns: + + + """ + match self._backend(): + case "h5netcdf" | "netCDF4": + return group.dimensions + + case "zarr": + from ..zarr import ZarrDimension + + # print(group) + if not hasattr(self, "_zarr_dims"): + # Cache the ZarrDimension objects, keyed by + # dimension basename. + self._zarr_dims = {} + + if not hasattr(self, "_zarr_var_to_dims"): + # Cache each variable's ZarrDimension objects, + # keyed by the full-path variable name. + self._zarr_var_to_dims = {} + + dimensions = {} + for v in group.array_values(): + dimension_names = v.metadata.dimension_names + if dimension_names is None: + # Scalar variable + continue + + for name, size in zip(dimension_names, v.shape): + if name in self._zarr_dims: + continue + + basename = name.split(group_separator)[-1] + if basename in dimensions: + continue + + zd = ZarrDimension(basename, size, group) + dimensions[basename] = zd + self._zarr_dims[name] = zd + + self._zarr_dims.update(dimensions) + + # print(' dimensions =',dimensions) + # print(' self._zarr_dims =',tuple(self._zarr_dims)) + + # Map zarr variables to their dimension objects + for v in group.array_values(): + dimension_names = v.metadata.dimension_names + if dimension_names is None: + # Scalar variable + dimension_names = () + + self._zarr_var_to_dims[v.path] = [ + self._zarr_dims[name] for name in dimension_names + ] + + # print(' self._zarr_var_to_dims=',tuple(self._zarr_var_to_dims)) + + return dimensions + + def _variables(self, group): + """Return variables that are defined in this group.""" + match self._backend(): + case "h5netcdf" | "netCDF4": + return group.variables + + case "zarr": + return dict(group.arrays()) + + def _child_groups(self, group): + """Return groups that are defined in this group.""" + match self._backend(): + case "h5netcdf" | "netCDF4": + return group.groups + + case "zarr": + return dict(group.groups()) + + def _backend(self, name=None): + if name is None: + return self._input_ds_backend + + return name + class AttributeParsingException(Exception): """Exception for unparsable attribute. 
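The new _dimensions support for Zarr relies on Zarr v3 recording netCDF-style
dimension names in each array's metadata, since a Zarr group has no dimension
objects of its own. A minimal sketch of recovering a name-to-size mapping for
one group (the store path is illustrative; array_values(),
metadata.dimension_names and shape are the zarr calls used above):

import zarr

group = zarr.open("tmp.zarr", mode="r")  # illustrative store path

dimensions = {}
for var in group.array_values():
    names = var.metadata.dimension_names
    if names is None:
        # Scalar variable, so no dimensions to record
        continue

    for name, size in zip(names, var.shape):
        # Keep the first size seen for each dimension name
        dimensions.setdefault(name, size)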
diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index db7238087..ca6c7f480 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -472,8 +472,8 @@ def dataset_close(self): for flat_dataset in g["flat_datasets"]: flat_dataset.close() - if g["dataset_opened_with"] == "zarr": - # zarr + if g["original_dataset_opened_with"] == "zarr": + # zarr: No need to close return # netCDF4, h5netcdf @@ -628,7 +628,7 @@ def dataset_open(self, dataset, flatten=True, verbose=None): g["flat_datasets"].append(flat_dataset) g["nc_opened_with"] = "netCDF4" else: - g["nc_opened_with"] = g["dataset_opened_with"] + g["nc_opened_with"] = g["original_dataset_opened_with"] g["nc"] = nc return nc @@ -649,7 +649,7 @@ def _open_netCDF4(self, filename): """ nc = netCDF4.Dataset(filename, "r") - self.read_vars["dataset_opened_with"] = "netCDF4" + self.read_vars["original_dataset_opened_with"] = "netCDF4" return nc def _open_h5netcdf(self, filename): @@ -683,7 +683,7 @@ def _open_h5netcdf(self, filename): rdcc_w0=0.75, rdcc_nslots=4133, ) - self.read_vars["dataset_opened_with"] = "h5netcdf" + self.read_vars["original_dataset_opened_with"] = "h5netcdf" return nc def _open_zarr(self, dataset): @@ -708,7 +708,7 @@ def _open_zarr(self, dataset): raise nc = zarr.open(dataset, mode="r") - self.read_vars["dataset_opened_with"] = "zarr" + self.read_vars["original_dataset_opened_with"] = "zarr" return nc def cdl_to_netcdf(self, filename): @@ -6639,7 +6639,9 @@ def _create_netcdfarray( group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) + # variable = group.variables.get(name) + variable = self._file_group_variables(group).get(name) + else: variable = g["variables"].get(ncvar) @@ -6658,7 +6660,8 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape - size = self._file_variable_size(variable) + # size = self._file_variable_size(variable) + size = prod(shape) if size < 2: size = int(size) @@ -6713,7 +6716,7 @@ def _create_netcdfarray( # elif file_opened_with == "zarr": # array = self.implementation.initialise_ZarrArray(**kwargs) - match g["dataset_opened_with"]: + match g["original_dataset_opened_with"]: case "netCDF4": array = self.implementation.initialise_NetCDF4Array( **kwargs @@ -8179,7 +8182,9 @@ def _create_Data( """ g = self.read_vars - match g["nc_opened_with"]: + + # Deal with strings + match g["original_dataset_opened_with"]: case "zarr": if array.dtype == np.dtypes.StringDType(): array = array.astype("O", copy=False).astype( @@ -8188,6 +8193,7 @@ def _create_Data( array = np.ma.masked_values(array, "") case _: + # h5netcdf | netCDF4 if array.dtype is None: if g["has_groups"]: group, name = self._netCDF4_group( @@ -9576,7 +9582,7 @@ def _netCDF4_group(self, nc, name): :Returns: - `netCDF4._netCDF4.Dataset` or `netCDF4._netCDF4.Group`, `str` + (`netCDF4._netCDF4.Dataset` or `netCDF4._netCDF4.Group`, `str`) **Examples** @@ -10870,9 +10876,6 @@ def _ugrid_check_connectivity_variable( def _dataset_has_groups(self, nc): """True if the dataset has a groups other than the root group. - If the dataset is a Zarr dataset then an exception is raised - of the dataset has groups. - .. 
versionadded:: (cfdm) 1.12.2.0 :Parameters: @@ -10885,18 +10888,27 @@ def _dataset_has_groups(self, nc): `bool` """ - if self.read_vars["dataset_opened_with"] == "zarr": - # zarr - if len(tuple(nc.groups())) > 1: - raise ReadError( - "Can't read Zarr dataset that has groups: " - f"{self.read_vars['dataset']}" - ) - - return False + match self.read_vars["original_dataset_opened_with"]: + case "h5netcdf" | "netCDF4": + return bool(nc.groups) - # netCDF4, h5netcdf - return bool(nc.groups) + case "zarr": + return bool(tuple(nc.groups())) + + # if self.read_vars["dataset_opened_with"] == "zarr": + # return bool(tuple(nc.groups())) + # # zarr + # #if len(tuple(nc.groups())) > 1: + # #if tuple(nc.groups()): + # # raise ReadError( + # # "Can't read Zarr dataset that has groups: " + # # f"{self.read_vars['dataset']}" + # # ) + # # + # #return False + # + # # netCDF4, h5netcdf + # return bool(nc.groups) def _file_global_attribute(self, nc, attr): """Return a global attribute from a dataset. @@ -10916,12 +10928,19 @@ def _file_global_attribute(self, nc, attr): The global attribute value. """ - try: - # netCDF4 - return nc.getncattr(attr) - except AttributeError: - # h5netcdf, zarr - return nc.attrs[attr] + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "zarr": + return nc.attrs[attr] + + case "netCDF4": + return nc.getncattr(attr) + + # try: + # # netCDF4 + # return nc.getncattr(attr) + # except AttributeError: + # # h5netcdf, zarr + # return nc.attrs[attr] def _file_global_attributes(self, nc): """Return the global attributes from a dataset. @@ -10931,7 +10950,8 @@ def _file_global_attributes(self, nc): :Parameters: nc: `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. :Returns: @@ -10940,42 +10960,102 @@ def _file_global_attributes(self, nc): names. """ - try: - # h5netcdf, zarr - return nc.attrs - except AttributeError: - # netCDF4 - return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "zarr": + return nc.attrs + + case "netCDF4": + return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + + # try: + # # h5netcdf, zarr + # return nc.attrs + # except AttributeError: + # # netCDF4 + # return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + + def _file_group_variables(self, group): + """Return all variables in a group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group. + + :Returns: + + `dict`-like + A dictionary of the variables keyed by their names. + + """ + match self.read_vars["original_dataset_opened_with"]: + case "h5netcdf" | "netCDF4": + return group.variables + + case "zarr": + return dict(group.arrays()) def _file_dimensions(self, nc): """Return all dimensions in the root group. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + nc: `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. + :Returns: `dict`-like A dictionary of the dimensions keyed by their names. 
""" - try: - # netCDF4, h5netcdf - return nc.dimensions - except AttributeError: - # zarr - dimensions = {} - for var in self._file_variables(nc).values(): - dimensions.update( - { - name: ZarrDimension(name, size, nc) - for name, size in zip( - self._file_variable_dimensions(var), var.shape - ) - if name not in dimensions - } - ) +# if hasattr(self, "_cached_file_dimensions"): +# return self._cached_file_dimensions - return dimensions + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + dimensions = dict(nc.dimensions) + + case "zarr": + dimensions = {} + for var in self._file_variables(nc).values(): + dimensions.update( + { + name: ZarrDimension(name, size, nc) + for name, size in zip( + self._file_variable_dimensions(var), var.shape + ) + if name not in dimensions + } + ) + + # self._cached_file_dimensions = dimensions + + return dimensions + + # try: + # # netCDF4, h5netcdf + # return nc.dimensions + # except AttributeError: + # # zarr + # dimensions = {} + # for var in self._file_variables(nc).values(): + # dimensions.update( + # { + # name: ZarrDimension(name, size, nc) + # for name, size in zip( + # self._file_variable_dimensions(var), var.shape + # ) + # if name not in dimensions + # } + # ) + # + # return dimensions def _file_dimension(self, nc, dim_name): """Return a dimension from the root group of a dataset. @@ -10985,14 +11065,15 @@ def _file_dimension(self, nc, dim_name): :Parameters: nc: `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. dim_name: `str` The dimension name. :Returns: - `netCDF.Dimension` or `h5netcdf.Dimension` + `netCDF.Dimension` or `h5netcdf.Dimension` or `ZarrDimension` The dimension. """ @@ -11000,14 +11081,15 @@ def _file_dimension(self, nc, dim_name): return self._file_dimensions(nc)[dim_name] def _file_dimension_isunlimited(self, nc, dim_name): - """Return whether a dimension is unlimited. + """Return whether a dimension in the root group is unlimited. .. versionadded:: (cfdm) 1.11.2.0 :Parameters: nc: `netCDF4.Dataset` or `h5netcdf.File` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. dim_name: `str` The dimension name. @@ -11018,12 +11100,19 @@ def _file_dimension_isunlimited(self, nc, dim_name): Whether the dimension is unlimited. """ - try: - # netCDF4, h5netcdf - return self._file_dimension(nc, dim_name).isunlimited() - except Exception: - # zarr - return False + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + return self._file_dimension(nc, dim_name).isunlimited() + + case "zarr": + return False + + # try: + # # netCDF4, h5netcdf + # return self._file_dimension(nc, dim_name).isunlimited() + # except Exception: + # # zarr + # return False def _file_dimension_size(self, nc, dim_name): """Return a dimension's size. @@ -11055,7 +11144,8 @@ def _file_variables(self, nc): :Parameters: nc: `netCDF4.Dataset`, `h5netcdf.File` or `zarr.Group` - The dataset. + The dataset. If the original dataset has groups, then + *nc* is the flattened dataset. :Returns: @@ -11063,12 +11153,19 @@ def _file_variables(self, nc): A dictionary of the variables keyed by their names. 
""" - try: - # netCDF4, h5netcdf - return nc.variables - except AttributeError: - # zarr - return dict(nc.arrays()) + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + return nc.variables + + case "zarr": + return dict(nc.arrays()) + + # try: + # # netCDF4, h5netcdf + # return nc.variables + # except AttributeError: + # # zarr + # return dict(nc.arrays()) def _file_variable(self, nc, var_name): """Return a variable. @@ -11100,7 +11197,8 @@ def _file_variable_attributes(self, var): :Parameters: var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - The variable. + The variable. If the original dataset has groups, then + *var* is from the flattened dataset. :Returns: @@ -11109,28 +11207,44 @@ def _file_variable_attributes(self, var): names. """ - try: - # h5netcdf, zarr - attrs = dict(var.attrs) - except AttributeError: - # netCDF4 - return {attr: var.getncattr(attr) for attr in var.ncattrs()} - else: - if self.read_vars["dataset_opened_with"] == "zarr": - # zarr: Remove the _ARRAY_DIMENSIONS attribute - attrs.pop("_ARRAY_DIMENSIONS", None) + match self.read_vars["nc_opened_with"]: + case "h5netcdf": + return dict(var.attrs) - return attrs + case "netCDF4": + return {attr: var.getncattr(attr) for attr in var.ncattrs()} + + case "zarr": + attrs = dict(var.attrs) + if self.read_vars["original_dataset_opened_with"] == "zarr": + # zarr: Remove the _ARRAY_DIMENSIONS attribute + attrs.pop("_ARRAY_DIMENSIONS", None) # TODOZARR + + return attrs + + # try: + # # h5netcdf, zarr + # attrs = dict(var.attrs) + # except AttributeError: + # # netCDF4 + # return {attr: var.getncattr(attr) for attr in var.ncattrs()} + # else: + # if self.read_vars["dataset_opened_with"] == "zarr": + # # zarr: Remove the _ARRAY_DIMENSIONS attribute + # attrs.pop("_ARRAY_DIMENSIONS", None) + # + # return attrs def _file_variable_dimensions(self, var): """Return the variable dimension names. - .. versionadded:: (cfdm) 1.12.2.0 + .. versionadded:: (cfdm) 1.12.2.0 :Parameters: - var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - The variable. + var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` + The variable. If the original dataset has groups, then + *var* is from the flattened dataset. :Returns: @@ -11138,47 +11252,70 @@ def _file_variable_dimensions(self, var): The dimension names. """ - try: - # netCDF4, h5netcdf - return var.dimensions - except AttributeError: - try: - # zarr v3 - dimension_names = var.metadata.dimension_names - if dimension_names is None: - # scalar variable - dimension_names = () - - return dimension_names - except AttributeError: - # zarr v2 - return tuple(var.attrs["_ARRAY_DIMENSIONS"]) - - def _file_variable_size(self, var): - """Return the size of a variable's array. - - .. versionadded:: (cfdm) 1.11.2.0 - - :Parameters: + match self.read_vars["nc_opened_with"]: + case "h5netcdf" | "netCDF4": + return var.dimensions - var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - The variable. - - :Returns: - - `int` - The array size. 
+ case "zarr": + try: + # Zarr v3 + dimension_names = var.metadata.dimension_names + if dimension_names is None: + # Scalar variable + dimension_names = () + + return dimension_names + except AttributeError: + # Zarr v2 + return tuple(var.attrs["_ARRAY_DIMENSIONS"]) + + # try: + # # netCDF4, h5netcdf + # return var.dimensions + # except AttributeError: + # try: + # # zarr v3 + # dimension_names = var.metadata.dimension_names + # if dimension_names is None: + # # scalar variable + # dimension_names = () + # + # return dimension_names + # except AttributeError: + # # zarr v2 + # return tuple(var.attrs["_ARRAY_DIMENSIONS"]) - """ - # Use try/except here because the variable type could differ - # from that implied by the value of - # read_vars["dataset_opened_with"] - try: - # netCDF4, zarr - return var.size - except AttributeError: - # h5netcdf - return prod(var.shape) + # def _file_variable_size(self, var): + # """Return the size of a variable's array. + # + # .. versionadded:: (cfdm) 1.11.2.0 + # + # :Parameters: + # + # var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` + # The variable. + # + # :Returns: + # + # `int` + # The array size. + # + # """ + # match self.read_vars["dataset_opened_with"]: + # case 'netCDF4'|'zarr': + # return var.size + # + # case 'h5netcdf': + # return prod(var.shape) + # # Use try/except here because the variable type could differ + # # from that implied by the value of + # # read_vars["dataset_opened_with"] + # try: + # # netCDF4, zarr + # return var.size + # except AttributeError: + # # h5netcdf + # return prod(var.shape) def _get_storage_options(self, dataset, parsed_dataset): """Get the storage options for accessing a file. @@ -11656,7 +11793,8 @@ def _cache_data_elements(self, data, ncvar, attributes=None): group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - variable = group.variables.get(name) + # variable = group.variables.get(name) + variable = self._file_group_variables(group).get(name) else: variable = g["variables"].get(ncvar) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index a3078f14d..75d607435 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -99,12 +99,15 @@ def _createGroup(self, parent, group_name): The new group object. """ - match self.write_vars["backend"]: + g = self.write_vars + match g["backend"]: case "netCDF4": return parent.createGroup(group_name) case "zarr": - return parent.create_group(group_name) + return parent.create_group( + group_name, overwrite=g["overwrite"] + ) case _: raise ValueError( @@ -702,7 +705,7 @@ def _write_dimension( # this dataset dimension. parent_group = self._parent_group(ncdim) - if g["group"] and "/" in ncdim: + if g["group"] and "/" in ncdim and g["backend"] != "zarr": # This dimension needs to go into a sub-group so replace # its name with its basename (CF>=1.8) ncdim = self._remove_group_structure(ncdim) @@ -2639,25 +2642,34 @@ def _createVariable(self, **kwargs): if chunks is None: chunks = shape - # Calculate the shard shape - if chunks: - if isinstance(shards, Integral): - n = int(shards ** (1 / len(chunks))) - if n > 1: - # More than one chunk per shard - shards = [c * n for c in chunks] + if shards is not None: + # Calculate the shard shape in the format expected + # by `zarr.create_array`, i.e. shards are defined + # by how many array elements along each dimension + # are in each shard. + if chunks == shape: + # One chunk per shard. 
+ # + # It doesn't matter what 'shards' is, because + # the data only has one chunk. + shards = None + else: + ndim = len(chunks) + if isinstance(shards, Integral): + n = int(shards ** (1 / ndim)) + shards = (n,) * ndim + + if prod(shards) > 1: + # More than one chunk per shard. + # + # E.g. shards=(10, 11, 12), chunks=(10, 20, + # 30) => shards=(100, 220, 360) + shards = [c * n for c, n in zip(chunks, shards)] else: - # One chunk per shard + # One chunk per shard. + # + # E.g. shards=(1, 1, 1) => shards=None shards = None - elif shards and prod(shards) > 1: - # More than one chunk per shard - shards = [c * n for c, n in zip(chunks, shards)] - else: - # One chunk per shard - shards = None - else: - # One chunk per shard - shards = None dtype = kwargs["datatype"] if dtype == "S1": @@ -2837,11 +2849,11 @@ def _write_netcdf_variable( .. versionadded:: (cfdm) 1.10.1.0 - chunking: sequence of `int`, optional - Set `netCDF4.createVariable` 'contiguous' and - `chunksizes` parameters (in that order). If not set - (the default), then these parameters are inferred from - the data. + chunking: sequence, optional + Set `_createVariable` 'contiguous', 'chunksizes', and + 'shards' parameters (in that order). If `None` (the + default), then these parameters are inferred from the + data. .. versionadded:: (cfdm) 1.12.0.0 @@ -2928,7 +2940,7 @@ def _write_netcdf_variable( # Set the dataset chunk strategy if chunking: - contiguous, chunksizes = chunking + contiguous, chunksizes, shards = chunking else: contiguous, chunksizes, shards = self._chunking_parameters( data, ncdimensions @@ -2954,23 +2966,12 @@ def _write_netcdf_variable( "not in the same group nor in a parent group." ) - # ------------------------------------------------------------ - # Replace dataset dimension names with their basenames - # (CF>=1.8) - # ------------------------------------------------------------ - ncdimensions_basename = [ - self._remove_group_structure(ncdim) for ncdim in ncdimensions - ] - - # Get shape of arra - # ------------------------------------------------------------ # Create a new dataset variable # ------------------------------------------------------------ kwargs = { "varname": ncvar, "datatype": datatype, - "dimensions": ncdimensions_basename, "endian": g["endian"], "contiguous": contiguous, "chunksizes": chunksizes, @@ -2980,6 +2981,22 @@ def _write_netcdf_variable( "chunk_cache": g["chunk_cache"], } + # ------------------------------------------------------------ + # Replace dataset dimension names with their basenames + # (CF>=1.8) + # ------------------------------------------------------------ + if g["backend"] == "zarr": + # ... but not for Zarr. This is because Zarr doesn't have + # the concept of dimensions belonging to a group (unlike + # netCDF), so by keeping the group structure in the + # dimension names we can know which group they belong to. 
+ kwargs["dimensions"] = ncdimensions + else: + ncdimensions_basename = [ + self._remove_group_structure(ncdim) for ncdim in ncdimensions + ] + kwargs["dimensions"] = ncdimensions_basename + if data is not None: compressed = self._compressed_data(ncdimensions) if compressed: @@ -4664,12 +4681,14 @@ def _write_group_attributes(self, fields): f0, attr ) - nc = g["dataset"] # TODOZARR - for group in groups: - if group in nc.groups: - nc = nc.groups[group] - else: - nc = self._createGroup(nc, group) + # nc = g["dataset"] # TODOZARR + nc = self._get_group(g["dataset"], groups) + # for group in groups: + # print (' nc.groups=', repr(nc.groups)) + # if group in nc.groups: + # nc = nc.groups[group] + # else: + # nc = self._createGroup(nc, group) if not g["dry_run"]: # nc.setncatts(this_group_attributes) @@ -4679,6 +4698,45 @@ def _write_group_attributes(self, fields): g["group_attributes"] = group_attributes + def _get_group(self, parent, groups): + """Get the group of *nc* defined by *groups*. + + The group will be created if it doesn't already exist. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + parent: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` + The group in which to find or create new group. + + groups: sequence of `str` + The group defined by the sequence of its subgroups + realtive to *parent*, e.g. ``('forecast', 'model')``. + + :Returns: + + `netCDF4.Group` or `Zarr.Group` + The group. + + """ + match self.write_vars["backend"]: + case "netCDF4": + for group in groups: + if group in parent.groups: + parent = parent.groups[group] + else: + parent = self._createGroup(parent, group) + + case "zarr": + group = "/".join(groups) + if group in parent: + parent = parent[group] + else: + parent = self._createGroup(parent, group) + + return parent + def _write_global_attributes(self, fields): """Writes all global properties to the dataset. @@ -5959,7 +6017,7 @@ def _chunking_parameters(self, data, ncdimensions): :Returns: 3-tuple - The *contiguous*, *chunksizes*, and *shards* + The 'contiguous', 'chunksizes', and 'shards' parameters for `_createVariable`. """ @@ -6400,9 +6458,9 @@ def _cfa_write_fragment_array_variable( Any attributes to attach to the variable. chunking: sequence, optional - Set `_createVariable` 'contiguous' and `chunksizes` - parameters (in that order) for the fragment array - variable. If not set (the default), then these + Set `_createVariable` 'contiguous', 'chunksizes', and + 'shards' parameters (in that order) for the fragment + array variable. If `None` (the default), then these parameters are inferred from the data. :Returns: diff --git a/cfdm/read_write/netcdf/zarr.py b/cfdm/read_write/netcdf/zarr.py index 564bd7d68..4778d7e5e 100644 --- a/cfdm/read_write/netcdf/zarr.py +++ b/cfdm/read_write/netcdf/zarr.py @@ -25,7 +25,7 @@ def __init__(self, name, size, group): """ self.name = name self.size = size - self.group = group + self._group = group def __len__(self): """The size of the dimension. @@ -37,6 +37,16 @@ def __len__(self): """ return self.size + def __repr__(self): + """The size of the dimension. + + x.__len__() <==> len(x) + + .. versionadded:: (cfdm) 1.12.2.0 + + """ + return f"" + def group(self): """Return the group that the dimension is a member of. @@ -48,7 +58,7 @@ def group(self): The group containing the dimension. """ - return self.group + return self._group def isunlimited(self): """Whether or not the dimension is unlimited. 
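Zarr stores have no native dimension objects, which is why the ZarrDimension shim above carries just a name, a size and its parent group, synthesised from the arrays' metadata. A minimal sketch of that inference, assuming a zarr-python v3 Group (the helper name and dataset path below are illustrative, not part of cfdm):

    import zarr

    def infer_dimension_sizes(group):
        """Map each named dimension to its size, inferred from the group's arrays."""
        dims = {}
        for _, array in group.arrays():
            # Zarr v3 keeps netCDF-style dimension names in the array metadata;
            # Zarr v2 datasets store them in the _ARRAY_DIMENSIONS attribute.
            names = getattr(array.metadata, "dimension_names", None)
            if names is None:
                names = array.attrs.get("_ARRAY_DIMENSIONS", ())
            for name, size in zip(names, array.shape):
                dims.setdefault(name, size)
        return dims

    print(infer_dimension_sizes(zarr.open("example.zarr")))  # e.g. {'lat': 5, 'lon': 8}
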
diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index 5bb7b2fb4..5fd99f608 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -171,6 +171,7 @@ def test_groups(self): # ------------------------------------------------------------ name = "grid_latitude" g.construct(name).bounds.nc_set_variable_groups(["forecast"]) + grouped_file = "grouped_file.nc" cfdm.write(g, grouped_file) nc = netCDF4.Dataset(grouped_file, "r") diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index dc5937436..e88b4ae8c 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -83,7 +83,7 @@ def test_zarr_read_write_1(self): self.assertTrue(z.equals(n)) def test_zarr_read_write_2(self): - """Test Zarr read/write on test netCDF files.""" + """Test Zarr read/write on various netCDF files.""" for filename in ( "DSG_timeSeries_contiguous.nc", "DSG_timeSeries_indexed.nc", @@ -102,18 +102,24 @@ def test_zarr_read_write_2(self): for a, b in zip(z, n): self.assertTrue(a.equals(b)) - def test_zarr_read_write_shards(self): - """Test Zarr read/write with shards.""" + def test_zarr_read_write_chunks_shards(self): + """Test Zarr read/write with chunks and shards.""" f = self.f0.copy() f.data.nc_set_dataset_chunksizes([2, 3]) cfdm.write(f, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1)[0] + self.assertTrue(z.equals(f)) + z = zarr.open(tmpdir1) self.assertEqual(z["q"].chunks, (2, 3)) self.assertIsNone(z["q"].shards) # Make shards comprising 4 chunks cfdm.write(f, tmpdir1, fmt="ZARR3", dataset_shards=4) + z = cfdm.read(tmpdir1)[0] + self.assertTrue(z.equals(f)) + z = zarr.open(tmpdir1) self.assertEqual(z["q"].chunks, (2, 3)) self.assertEqual(z["q"].shards, (4, 6)) @@ -121,6 +127,9 @@ def test_zarr_read_write_shards(self): for shards in (4, [2, 2]): f.data.nc_set_dataset_shards(shards) cfdm.write(f, tmpdir1, fmt="ZARR3") + z = cfdm.read(tmpdir1)[0] + self.assertTrue(z.equals(f)) + z = zarr.open(tmpdir1) self.assertEqual(z["q"].chunks, (2, 3)) self.assertEqual(z["q"].shards, (4, 6)) From 27f0b3516ca0e7e58fb98b549fcc18f816bc6d07 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 29 Aug 2025 08:58:57 +0100 Subject: [PATCH 10/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 27 ++++++++++++------- cfdm/read_write/netcdf/netcdfread.py | 23 ++++++++-------- cfdm/read_write/netcdf/netcdfwrite.py | 32 +++++++---------------- 3 files changed, 39 insertions(+), 43 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 612548fa1..ce69cce3e 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -1786,16 +1786,15 @@ def handle_reference_error(self, ref, context=None): def _dimensions(self, group): """Return dimensions that are defined in this group. - + .. versionadded:: (cfdm) NEXTVERSION :Parameters: - group: + group: :Returns: - """ match self._backend(): case "h5netcdf" | "netCDF4": @@ -1806,13 +1805,21 @@ def _dimensions(self, group): # print(group) if not hasattr(self, "_zarr_dims"): - # Cache the ZarrDimension objects, keyed by - # dimension basename. + # Mapping of dimension names to Dimension objects. + # + # E.g. {'x': , + # 'y': , + # '/forecast/y': } self._zarr_dims = {} if not hasattr(self, "_zarr_var_to_dims"): - # Cache each variable's ZarrDimension objects, - # keyed by the full-path variable name. + # Mapping of variable names to their Dimension objects. + # + # E.g. 
{'x': [], + # 'x_bnds': [, + # ], + # 'latitude_longitude': [], + # 'forecast/y': []} self._zarr_var_to_dims = {} dimensions = {} @@ -1837,7 +1844,7 @@ def _dimensions(self, group): self._zarr_dims.update(dimensions) # print(' dimensions =',dimensions) - # print(' self._zarr_dims =',tuple(self._zarr_dims)) + # print(' self._zarr_dims =',self._zarr_dims) # Map zarr variables to their dimension objects for v in group.array_values(): @@ -1850,7 +1857,9 @@ def _dimensions(self, group): self._zarr_dims[name] for name in dimension_names ] - # print(' self._zarr_var_to_dims=',tuple(self._zarr_var_to_dims)) + print( + " self._zarr_var_to_dims=", self._zarr_var_to_dims + ) return dimensions diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index ca6c7f480..b3aec9bb3 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -8185,15 +8185,7 @@ def _create_Data( # Deal with strings match g["original_dataset_opened_with"]: - case "zarr": - if array.dtype == np.dtypes.StringDType(): - array = array.astype("O", copy=False).astype( - "U", copy=False - ) - array = np.ma.masked_values(array, "") - - case _: - # h5netcdf | netCDF4 + case "h5netcdf" | "netCDF4": if array.dtype is None: if g["has_groups"]: group, name = self._netCDF4_group( @@ -8226,6 +8218,13 @@ def _create_Data( # array = np.ma.where(array == "", np.ma.masked, array) array = np.ma.masked_values(array, "") + case "zarr": + if array.dtype == np.dtypes.StringDType(): + array = array.astype("O", copy=False).astype( + "U", copy=False + ) + array = np.ma.masked_values(array, "") + # Set the dask chunking strategy chunks = self._dask_chunks( array, ncvar, compressed, construct_type=construct_type @@ -11014,8 +11013,8 @@ def _file_dimensions(self, nc): A dictionary of the dimensions keyed by their names. """ -# if hasattr(self, "_cached_file_dimensions"): -# return self._cached_file_dimensions + # if hasattr(self, "_cached_file_dimensions"): + # return self._cached_file_dimensions match self.read_vars["nc_opened_with"]: case "h5netcdf" | "netCDF4": @@ -11034,7 +11033,7 @@ def _file_dimensions(self, nc): } ) - # self._cached_file_dimensions = dimensions + # self._cached_file_dimensions = dimensions return dimensions diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 75d607435..d94be700b 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -109,11 +109,6 @@ def _createGroup(self, parent, group_name): group_name, overwrite=g["overwrite"] ) - case _: - raise ValueError( - f"Bad backend: {self.write_vars['backend']!r}" - ) # pragma: no cover - def _create_variable_name(self, parent, default): """Create an appropriate name for a dataset variable. @@ -361,8 +356,6 @@ def _set_attributes(self, attributes, ncvar=None, group=None): attributes[attr] = value.tolist() x.update_attributes(attributes) - case _: - raise ValueError(f"Bad backend: {g['backend']!r}") def _character_array(self, array): """Converts a numpy array of strings to character data type. @@ -553,10 +546,6 @@ def _createDimension(self, group, ncdim, size): case "zarr": # Dimensions are not created in Zarr datasets pass - case _: - raise ValueError( - f"Bad backend: {self.write_vars['backend']!r}" - ) # pragma: no cover def _dataset_dimensions(self, field, key, construct): """Returns the dataset dimension names for the construct. 
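The string handling added to _create_Data above is what turns Zarr's variable-width string arrays into the masked, fixed-width form used elsewhere in cfdm. The same conversion in isolation (a sketch; requires NumPy >= 2.0 for StringDType):

    import numpy as np

    array = np.array(["low", "", "high"], dtype=np.dtypes.StringDType())

    if array.dtype == np.dtypes.StringDType():
        # Go via object dtype to get a fixed-width 'U' array, then mask the
        # empty strings, which is how missing values appear here.
        array = array.astype("O", copy=False).astype("U", copy=False)
        array = np.ma.masked_values(array, "")

    print(array)  # ['low' -- 'high']
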
@@ -1835,7 +1824,8 @@ def _parent_group(self, name): ) for group_name in name.split("/")[1:-1]: - parent_group = self._createGroup(parent_group, group_name) + if group_name not in parent_group: + parent_group = self._createGroup(parent_group, group_name) return parent_group @@ -2690,9 +2680,6 @@ def _createVariable(self, **kwargs): variable = g["dataset"].create_array(**zarr_kwargs) print("___________") - case _: - raise ValueError(f"Bad backend: {g['backend']!r}") - g["nc"][ncvar] = variable def _write_grid_mapping(self, f, ref, multiple_grid_mappings): @@ -4699,7 +4686,7 @@ def _write_group_attributes(self, fields): g["group_attributes"] = group_attributes def _get_group(self, parent, groups): - """Get the group of *nc* defined by *groups*. + """Get the group of *parent* defined by *groups*. The group will be created if it doesn't already exist. @@ -4708,11 +4695,11 @@ def _get_group(self, parent, groups): :Parameters: parent: `netCDF4.Dateset` or `netCDF4.Group` or `Zarr.Group` - The group in which to find or create new group. + The group in which to find or create new group. groups: sequence of `str` The group defined by the sequence of its subgroups - realtive to *parent*, e.g. ``('forecast', 'model')``. + relative to *parent*, e.g. ``('forecast', 'model')``. :Returns: @@ -5038,11 +5025,12 @@ def dataset_open(self, dataset_name, mode, fmt, fields): ) raise - nc = zarr.group( - dataset_name, overwrite=g["overwrite"], zarr_format=3 + nc = zarr.create_group( + dataset_name, + overwrite=g["overwrite"], + zarr_format=3, + storage_options=g.get("storage_options"), ) - case _: - raise ValueError(f"Bad backend: {g['backend']!r}") return nc From 1bccf28f7046578286c4343488ab97ccce9d17f9 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 29 Aug 2025 15:48:08 +0100 Subject: [PATCH 11/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 132 +++++++++++----- cfdm/read_write/netcdf/netcdfwrite.py | 7 +- cfdm/test/test_groups.py | 1 + cfdm/test/test_zarr.py | 176 +++++++++++++++++++++- 4 files changed, 279 insertions(+), 37 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index ce69cce3e..9636eb133 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -317,6 +317,7 @@ def __init__( self._input_ds = input_ds self._output_ds = output_ds + # Record the backend that defines 'input_ds' if hasattr(input_ds, "_h5file"): self._input_ds_backend = "h5netcdf" elif hasattr(input_ds, "data_model"): @@ -324,7 +325,10 @@ def __init__( elif hasattr(input_ds, "store"): self._input_ds_backend = "zarr" else: - raise ValueError("TODOZARR") + raise ValueError( + "Unknown type of 'input_ds'. Must be one of h5netcdf.File, " + f"netCDF4.Dataset, or zarr.Group. Got {type(input_ds)}" + ) self._strict = bool(strict) self._omit_data = bool(omit_data) @@ -340,7 +344,7 @@ def __init__( "be different, and output should be of the 'NETCDF4' format." ) - def attrs(self, variable, backend=None): + def attrs(self, variable, dataset=None): """Return the variable attributes. .. versionadded:: (cfdm) 1.11.2.0 @@ -358,7 +362,7 @@ def attrs(self, variable, backend=None): names. """ - match self._backend(backend): + match self._backend(dataset): case "netCDF4": return { attr: variable.getncattr(attr) @@ -618,7 +622,7 @@ def group(self, x): # Dimension return x.group() - def name(self, x, backend=None): + def name(self, x, dataset=None): """Return the netCDF name, without its groups. .. 
versionadded:: (cfdm) 1.11.2.0 @@ -628,7 +632,7 @@ def name(self, x, backend=None): `str` """ - match self._backend(backend): + match self._backend(dataset): case "h5netcdf" | "netCDF4": return x.name.split(group_separator)[-1] @@ -841,7 +845,6 @@ def flatten_dimension(self, dim): ) # Write dimension - # print ('creating dimension:', new_name,) # '(org in', self.group(dim)) self._output_ds.createDimension( new_name, (len(dim), None)[dim.isunlimited()] ) @@ -906,7 +909,6 @@ def flatten_variable(self, var): else: fill_value = attributes.pop("_FillValue", None) - # print ('creating variable:', new_name)# '(org in', self.group(var)) new_var = self._output_ds.createVariable( new_name, self.dtype(var), @@ -1078,6 +1080,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): The absolute path to the reference. """ + print ('A', orig_ref, rules.name) ref = orig_ref absolute_ref = None ref_type = "" @@ -1105,7 +1108,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): ref_type = "dimension" else: ref_type = "variable" - + absolute_ref = self.search_by_relative_path( orig_ref, self.group(orig_var), resolve_dim_or_var ) @@ -1123,6 +1126,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): # Reference is to be searched by proximity else: + print (9999) method = "Proximity" absolute_ref, ref_type = self.resolve_reference_proximity( ref, @@ -1131,6 +1135,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): orig_var, rules, ) + print ('abs =', absolute_ref) # Post-search checks and return result return self.resolve_reference_post_processing( @@ -1193,14 +1198,14 @@ def resolve_reference_proximity( False, stop_at_local_apex, ) - +# print ( 'resolved_var = ',resolved_var ) # If failed and alternative possible, second tentative if resolved_var is None and resolve_alt: if resolve_dim_or_var: ref_type = "variable" else: ref_type = "dimension" - + print ('ref_type =' , ref_type) resolved_var = self.search_by_proximity( ref, self.group(orig_var), @@ -1217,8 +1222,9 @@ def resolve_reference_proximity( ), ref_type, ) - else: - return None, "" + + # Unresolved + return None, "" def resolve_reference_post_processing( self, absolute_ref, orig_ref, orig_var, rules, ref_type, method @@ -1268,7 +1274,7 @@ def resolve_reference_post_processing( elif absolute_ref is None: # Not found, so raise exception. absolute_ref = self.handle_reference_error( - orig_ref, self.path(self.group(orig_var)) + rules.name, orig_ref, self.path(self.group(orig_var)) ) else: # Found @@ -1395,21 +1401,23 @@ def search_by_proximity( `None`. 
""" + print ( 'search_dim=', search_dim, current_group) if search_dim: # dims_or_vars = current_group.dimensions # TODOZARR dims_or_vars = self._dimensions(current_group) + print (dims_or_vars) else: # dims_or_vars = current_group.variables dims_or_vars = self._variables(current_group) # Found in current group - if ref in dims_or_vars.keys(): + if ref in dims_or_vars: #.keys(): return dims_or_vars[ref] local_apex_reached = ( # local_apex_reached or ref in current_group.dimensions.keys() local_apex_reached - or ref in self._dimensions(current_group).keys() + or ref in dims_or_vars # TODOZARR self._dimensions(current_group).keys() ) # Check if have to continue looking in parent group @@ -1425,6 +1433,7 @@ def search_by_proximity( # Search up if not top_reached: + print ('not top_reached') return self.search_by_proximity( ref, # current_group.parent, @@ -1480,7 +1489,7 @@ def resolve_references(self, var, old_var): `None` """ - var_attrs = self.attrs(var, "netCDF4") + var_attrs = self.attrs(var, "output") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value parsed_attribute = parse_attribute(name, var_attrs[name]) @@ -1528,7 +1537,7 @@ def adapt_references(self, var): `None` """ - var_attrs = self.attrs(var, "netCDF4") + var_attrs = self.attrs(var, "output") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value value = var_attrs[name] @@ -1553,7 +1562,7 @@ def adapt_references(self, var): var.setncattr(name, new_attr_value) logging.info( - f" Value of {self.name(var, 'netCDF4')}.{name} " + f" Value of {self.name(var, 'output')}.{name} " f"changed from {value!r} to {new_attr_value!r}" ) @@ -1614,7 +1623,7 @@ def adapt_name(self, resolved_ref, rules): else: # If not found, raise exception - return self.handle_reference_error(resolved_ref) + return self.handle_reference_error(rules.name, resolved_ref) def pathname(self, group, name): """Compose full path name to an element in a group structure. @@ -1750,7 +1759,7 @@ def generate_flattened_name(self, input_group, orig_name): return new_name - def handle_reference_error(self, ref, context=None): + def handle_reference_error(self, role, ref, context=None): """Handle reference error. Depending on the `_strict` mode, either raise an exception or @@ -1761,8 +1770,12 @@ def handle_reference_error(self, ref, context=None): :Parameters: + role: `str` + The CF role of the reference, + e.g. ``'instance_dimension'``, ``'cell_measures'``. + ref: `str` - The reference + The reference. context: `str` Additional context information to add to message. @@ -1774,7 +1787,7 @@ def handle_reference_error(self, ref, context=None): `UnresolvedReferenceException` is raised. """ - message = f"Reference {ref!r} could not be resolved" + message = f"{role} reference {ref!r} could not be resolved" if context is not None: message = f"{message} from {context}" @@ -1792,9 +1805,13 @@ def _dimensions(self, group): :Parameters: group: + The group to inspect. :Returns: + `dict`-like + The dimensions, keyed by their names. + """ match self._backend(): case "h5netcdf" | "netCDF4": @@ -1803,12 +1820,12 @@ def _dimensions(self, group): case "zarr": from ..zarr import ZarrDimension - # print(group) if not hasattr(self, "_zarr_dims"): # Mapping of dimension names to Dimension objects. # # E.g. 
{'x': , # 'y': , + # 'bounds2': , # '/forecast/y': } self._zarr_dims = {} @@ -1839,14 +1856,14 @@ def _dimensions(self, group): zd = ZarrDimension(basename, size, group) dimensions[basename] = zd - self._zarr_dims[name] = zd + self._zarr_dims[name] = zd # TODOZARR RESOLVE NAME? self._zarr_dims.update(dimensions) # print(' dimensions =',dimensions) - # print(' self._zarr_dims =',self._zarr_dims) +# print(' self._zarr_dims =',list(self._zarr_dims)) - # Map zarr variables to their dimension objects + # Map variables to their dimension objects for v in group.array_values(): dimension_names = v.metadata.dimension_names if dimension_names is None: @@ -1857,14 +1874,27 @@ def _dimensions(self, group): self._zarr_dims[name] for name in dimension_names ] - print( - " self._zarr_var_to_dims=", self._zarr_var_to_dims - ) + # print(" self._zarr_var_to_dims=", self._zarr_var_to_dims) + print('durrent group dimensions', dimensions) return dimensions def _variables(self, group): - """Return variables that are defined in this group.""" + """Return variables that are defined in this group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group to inspect. + + :Returns: + + `dict`-like + The variables, keyed by their names. + + """ match self._backend(): case "h5netcdf" | "netCDF4": return group.variables @@ -1873,7 +1903,21 @@ def _variables(self, group): return dict(group.arrays()) def _child_groups(self, group): - """Return groups that are defined in this group.""" + """Return groups that are defined in this group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group to inspect. + + :Returns: + + `dict`-like + The groups, keyed by their names. + + """ match self._backend(): case "h5netcdf" | "netCDF4": return group.groups @@ -1881,12 +1925,32 @@ def _child_groups(self, group): case "zarr": return dict(group.groups()) - def _backend(self, name=None): - if name is None: + def _backend(self, dataset=None): + """Return the name of the backend that defines a dataset. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `str` or `None` + If set to ``'output'`` then the name of the output + dateset backend will be returned. If `None` (the + default) then the name of backend that defines the + input dataset is returned. + + :Returns: + + `str` + The backend name. + + """ + if dataset is None: return self._input_ds_backend - return name + if dataset == "output": + return "netCDF4" + raise("Bad value of 'dataset'") class AttributeParsingException(Exception): """Exception for unparsable attribute. 
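For reference, the zarr-python calls behind _variables and _child_groups above behave like this on a grouped store (a sketch; the store path and group name are illustrative):

    import zarr

    root = zarr.open("grouped.zarr")

    # Immediate subgroups and arrays of the root group, as name -> object maps
    subgroups = dict(root.groups())   # e.g. {'forecast': <Group .../forecast>}
    variables = dict(root.arrays())   # arrays defined directly in the root group

    # The same calls work on any group, so the hierarchy can be walked recursively
    print(sorted(dict(subgroups["forecast"].arrays())))
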
diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index d94be700b..1ef2c47b8 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -105,6 +105,9 @@ def _createGroup(self, parent, group_name): return parent.createGroup(group_name) case "zarr": + if group_name in parent: + return parent[group_name] + return parent.create_group( group_name, overwrite=g["overwrite"] ) @@ -1824,8 +1827,8 @@ def _parent_group(self, name): ) for group_name in name.split("/")[1:-1]: - if group_name not in parent_group: - parent_group = self._createGroup(parent_group, group_name) + # if group_name not in parent_group: + parent_group = self._createGroup(parent_group, group_name) return parent_group diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index 5fd99f608..c94a9787f 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -305,6 +305,7 @@ def test_groups_geometry(self): g.nc_set_component_variable("interior_ring", "interior_ring") g.nc_set_component_variable_groups("interior_ring", ["forecast"]) + grouped_file='grouped_file.nc' cfdm.write(g, grouped_file) # Check that the variable is in the right group diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index e88b4ae8c..da1510260 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -77,7 +77,6 @@ def test_zarr_read_write_1(self): self.assertTrue(z.equals(f)) # Check that the Zarr and netCDF4 encodings are equivalent - tmpfile1 = "delme.nc" cfdm.write(f, tmpfile1, fmt="NETCDF4") n = cfdm.read(tmpfile1)[0] self.assertTrue(z.equals(n)) @@ -137,6 +136,7 @@ def test_zarr_read_write_chunks_shards(self): def test_zarr_read_write_CFA(self): """Test CF aggreagtion in Zarr.""" f = self.f0 + cfdm.write(f, tmpdir1, fmt="ZARR3") cfdm.write(f, tmpfile1, fmt="NETCDF4") @@ -155,6 +155,180 @@ def test_zarr_read_write_CFA(self): self.assertTrue(z.equals(f)) self.assertTrue(z.equals(n)) + def test_zarr_groups(self): + """Test for the general handling of Zarr hierarchical groups.""" + f = cfdm.example_field(1) + + # Add a second grid mapping + datum = cfdm.Datum(parameters={"earth_radius": 7000000}) + conversion = cfdm.CoordinateConversion( + parameters={"grid_mapping_name": "latitude_longitude"} + ) + + grid = cfdm.CoordinateReference( + coordinate_conversion=conversion, + datum=datum, + coordinates=["auxiliarycoordinate0", "auxiliarycoordinate1"], + ) + + f.set_construct(grid) + + grid0 = f.construct("grid_mapping_name:rotated_latitude_longitude") + grid0.del_coordinate("auxiliarycoordinate0") + grid0.del_coordinate("auxiliarycoordinate1") + + grouped_dir = tmpdir1 + grouped_file = tmpfile1 + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + f.construct("grid_latitude").bounds.nc_set_variable_groups(["forecast"]) + for name in ( + "longitude", # Auxiliary coordinate + "latitude", # Auxiliary coordinate + "long_name=Grid latitude name", # Auxiliary coordinate + "measure:area", # Cell measure + "surface_altitude", # Domain ancillary + "air_temperature standard_error", # Field ancillary + "grid_mapping_name:rotated_latitude_longitude", + "time", # Dimension coordinate + "grid_latitude", # Dimension coordinate + ): + f.construct(name).nc_set_variable_groups(["forecast"]) + + # Check the groups + cfdm.write(f, grouped_file, fmt='NETCDF4') + cfdm.write(f, grouped_dir, fmt='ZARR3') + + n = cfdm.read(grouped_file)[0] + z = cfdm.read(grouped_dir)[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + # Directly 
check the groups in the Zarr dataset + x = zarr.open(grouped_dir) + self.assertEqual(list(x.group_keys()), ['forecast']) + self.assertEqual(list(x['forecast'].group_keys()), ['model']) + + cfdm.write(z, tmpdir2, fmt='ZARR3') + z1 = cfdm.read(tmpdir2)[0] + self.assertTrue(z1.equals(f)) + + def test_zarr_groups_dimension(self): + """Test the dimensions of Zarr hierarchical groups.""" + f = self.f0.copy() + + grouped_dir = tmpdir1 + grouped_file = tmpfile1 + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + for construct in f.constructs.filter_by_data().values(): + construct.nc_set_variable_groups(["forecast"]) + + for construct in f.coordinates().values(): + try: + construct.bounds.nc_set_variable_groups(["forecast"]) + except ValueError: + pass + + domain_axis = f.domain_axis("latitude") + domain_axis.nc_set_dimension_groups(["forecast"]) + + # Check the groups + cfdm.write(f, grouped_file, fmt='NETCDF4') + cfdm.write(f, grouped_dir, fmt='ZARR3') + + n = cfdm.read(grouped_file)[0] + z = cfdm.read(grouped_dir)[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + def test_zarr_groups_compression(self): + """Test the compression of Zarr hierarchical groups.""" + f = cfdm.example_field(4) + + grouped_dir = 'tmpdir1' + grouped_file = 'tmpfile1.nc' + + f.compress("indexed_contiguous", inplace=True) + f.data.get_count().nc_set_variable("count") + f.data.get_index().nc_set_variable("index") + + + # Set some groups. (Write the read the field first to create + # the compressions variables on disk.) + cfdm.write(f, tmpfile2) + f = cfdm.read(tmpfile2)[0] + + # ------------------------------------------------------------ + # Move the field construct to the /forecast/model group + # ------------------------------------------------------------ + f.nc_set_variable_groups(["forecast", "model"]) + + # ------------------------------------------------------------ + # Move the count variable to the /forecast group + # ------------------------------------------------------------ + f.data.get_count().nc_set_variable_groups(["forecast"]) + + # ------------------------------------------------------------ + # Move the index variable to the /forecast group + # ------------------------------------------------------------ + f.data.get_index().nc_set_variable_groups(["forecast"]) + + # ------------------------------------------------------------ + # Move the coordinates that span the element dimension to the + # /forecast group + # ------------------------------------------------------------ + f.construct("altitude").nc_set_variable_groups(["forecast"]) + + # ------------------------------------------------------------ + # Move the sample dimension to the /forecast group + # ------------------------------------------------------------ + f.data.get_count().nc_set_sample_dimension_groups(["forecast"]) + print(f) + cfdm.write(f, grouped_file, fmt='NETCDF4') + cfdm.write(f, grouped_dir, fmt='ZARR3') + + n = cfdm.read(grouped_file) + z = cfdm.read(grouped_dir) + print(n) + print (z) + + n = n[0] + z = z[0] + self.assertTrue(z.equals(n, verbose=-1)) + self.assertTrue(z.equals(f)) + + def test_zarr_groups_geometry(self): + """Test that geometries are considered in the correct Zarr groups.""" + f = cfdm.example_field(6) + + grouped_dir = tmpdir1 + grouped_file = tmpfile1.nc + + cfdm.write(f, tmpfile2) + f = cfdm.read(tmpfile2)[0] + + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) + f.nc_set_geometry_variable_groups(["forecast"]) + 
f.coordinate("longitude").bounds.nc_set_variable_groups(["forecast"]) + f.nc_set_component_variable_groups("node_count", ["forecast"]) + f.nc_set_component_variable_groups("part_node_count", ["forecast"]) + f.nc_set_component_variable("interior_ring", "interior_ring") + f.nc_set_component_variable_groups("interior_ring", ["forecast"]) + + # Check the groups + cfdm.write(f, grouped_file, fmt='NETCDF4') + cfdm.write(f, grouped_dir, fmt='ZARR3') + + n = cfdm.read(grouped_file)[0] + z = cfdm.read(grouped_dir)[0] + self.assertTrue(z.equals(n)) + self.assertTrue(z.equals(f)) + + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From ed5804dfe5f0ec561da6495c4bdd9d491ec56c98 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 29 Aug 2025 22:39:25 +0100 Subject: [PATCH 12/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 73 +++++++++++------------ cfdm/test/test_zarr.py | 2 +- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 9636eb133..33855c86d 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -405,7 +405,7 @@ def chunksizes(self, variable): case "netCDF4": chunking = variable.chunking() if chunking == "contiguous": - return None + return return chunking @@ -543,7 +543,7 @@ def get_dims(self, variable): :Returns: - `list` + `list` of dimension objects """ match self._backend(): @@ -666,23 +666,23 @@ def ncattrs(self, x): return x.ncattrs() def parent(self, group): - """Return a simulated unix parent group. + """Return the parent group. .. versionadded:: (cfdm) 1.11.2.0 :Returns: `Group` or `None` - The parent grup, or `None` if *group* is the root + The parent group, or `None` if *group* is the root group (and so has no parent). """ match self._backend(): case "h5netcdf" | "netCDF4": - try: - return group.parent - except AttributeError: - return +# try: + return group.parent +# except AttributeError: +# return case "zarr": name = group.name @@ -769,7 +769,6 @@ def process_group(self, input_group): for dim in self._dimensions(input_group).values(): self.flatten_dimension(dim) - # for var in input_group.variables.values(): # self.flatten_variable(var) @@ -1080,7 +1079,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): The absolute path to the reference. 
""" - print ('A', orig_ref, rules.name) +# print ('\nA', orig_ref, rules.name) ref = orig_ref absolute_ref = None ref_type = "" @@ -1126,7 +1125,6 @@ def resolve_reference(self, orig_ref, orig_var, rules): # Reference is to be searched by proximity else: - print (9999) method = "Proximity" absolute_ref, ref_type = self.resolve_reference_proximity( ref, @@ -1135,7 +1133,6 @@ def resolve_reference(self, orig_ref, orig_var, rules): orig_var, rules, ) - print ('abs =', absolute_ref) # Post-search checks and return result return self.resolve_reference_post_processing( @@ -1198,14 +1195,14 @@ def resolve_reference_proximity( False, stop_at_local_apex, ) -# print ( 'resolved_var = ',resolved_var ) + # If failed and alternative possible, second tentative if resolved_var is None and resolve_alt: if resolve_dim_or_var: ref_type = "variable" else: ref_type = "dimension" - print ('ref_type =' , ref_type) + resolved_var = self.search_by_proximity( ref, self.group(orig_var), @@ -1333,26 +1330,30 @@ def search_by_relative_path(self, ref, current_group, search_dim): """ # Go up parent groups while ref.startswith("../"): - if current_group.parent is None: - return None + parent = self.parent(current_group) + if parent is None: + return ref = ref[3:] - current_group = current_group.parent + current_group = parent # Go down child groups ref_split = ref.split(group_separator) for g in ref_split[:-1]: try: - current_group = current_group.groups[g] +# current_group = current_group.groups[g] + current_group = self._child_groups(current_group)[g] except KeyError: - return None + return # Get variable or dimension if search_dim: - elt = current_group.dimensions[ref_split[-1]] +# elt = current_group.dimensions[ref_split[-1]] + elt = tuple(self._dimensions(current_group))[ref_split[-1]] + else: # elt = current_group.variables[ref_split[-1]] - elt = current_group.variables[ref_split[-1]] + elt = tuple(self._variables(current_group))[ref_split[-1]] # Get absolute reference return self.pathname(self.group(elt), self.name(elt)) @@ -1401,11 +1402,9 @@ def search_by_proximity( `None`. """ - print ( 'search_dim=', search_dim, current_group) if search_dim: # dims_or_vars = current_group.dimensions # TODOZARR dims_or_vars = self._dimensions(current_group) - print (dims_or_vars) else: # dims_or_vars = current_group.variables dims_or_vars = self._variables(current_group) @@ -1415,7 +1414,7 @@ def search_by_proximity( return dims_or_vars[ref] local_apex_reached = ( - # local_apex_reached or ref in current_group.dimensions.keys() + # local_apex_reached or ref in current_group.dimensions.keys() local_apex_reached or ref in dims_or_vars # TODOZARR self._dimensions(current_group).keys() ) @@ -1433,10 +1432,8 @@ def search_by_proximity( # Search up if not top_reached: - print ('not top_reached') return self.search_by_proximity( ref, - # current_group.parent, parent_group, search_dim, local_apex_reached, @@ -1445,9 +1442,10 @@ def search_by_proximity( elif is_coordinate_variable and local_apex_reached: # Coordinate variable and local apex reached, so search - # down in siblings + # down in siblings. found_elt = None - for child_group in current_group.groups.values(): +# for child_group in current_group.groups.values(): + for child_group in self._child_groups(current_group).values(): found_elt = self.search_by_proximity( ref, child_group, @@ -1460,9 +1458,8 @@ def search_by_proximity( return found_elt - else: - # Did not find - return None + # Did not find + return def resolve_references(self, var, old_var): """Resolve references. 
@@ -1819,7 +1816,7 @@ def _dimensions(self, group): case "zarr": from ..zarr import ZarrDimension - + print('Group:', repr(self.name(group))) if not hasattr(self, "_zarr_dims"): # Mapping of dimension names to Dimension objects. # @@ -1839,6 +1836,8 @@ def _dimensions(self, group): # 'forecast/y': []} self._zarr_var_to_dims = {} + # Loop round this group's variables, finding the + # dimension_names for each one. dimensions = {} for v in group.array_values(): dimension_names = v.metadata.dimension_names @@ -1860,10 +1859,10 @@ def _dimensions(self, group): self._zarr_dims.update(dimensions) - # print(' dimensions =',dimensions) -# print(' self._zarr_dims =',list(self._zarr_dims)) + print(' self._zarr_dims =',self._zarr_dims) - # Map variables to their dimension objects + # Map this group's variables to their dimension + # objects for v in group.array_values(): dimension_names = v.metadata.dimension_names if dimension_names is None: @@ -1874,9 +1873,9 @@ def _dimensions(self, group): self._zarr_dims[name] for name in dimension_names ] - # print(" self._zarr_var_to_dims=", self._zarr_var_to_dims) + print(" self._zarr_var_to_dims=", self._zarr_var_to_dims) - print('durrent group dimensions', dimensions) + print('current group dimensions', dimensions) return dimensions def _variables(self, group): diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index da1510260..6eb34c23d 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -274,7 +274,7 @@ def test_zarr_groups_compression(self): # ------------------------------------------------------------ # Move the index variable to the /forecast group # ------------------------------------------------------------ - f.data.get_index().nc_set_variable_groups(["forecast"]) +# f.data.get_index().nc_set_variable_groups(["forecast"]) # ------------------------------------------------------------ # Move the coordinates that span the element dimension to the From 0ab6a871a341ab0da6cdd1d3aa476ff27a19a146 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sat, 30 Aug 2025 19:12:17 +0100 Subject: [PATCH 13/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 372 +++++++++++++--------- cfdm/test/test_groups.py | 2 +- cfdm/test/test_zarr.py | 81 ++--- 3 files changed, 253 insertions(+), 202 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 33855c86d..f22cb788b 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -18,6 +18,7 @@ import re import warnings +from ....functions import is_log_level_debug from .config import ( flattener_attribute_map, flattener_dimension_map, @@ -29,6 +30,8 @@ ref_not_found_error, ) +logger = logging.getLogger(__name__) + # Mapping from numpy dtype endian format to that expected by netCDF4 _dtype_endian_lookup = { "=": "native", @@ -50,7 +53,7 @@ def netcdf_flatten( omit_data=False, write_chunksize=134217728, ): - """Create a flattened version of a grouped netCDF dataset. + """Create a flattened version of a grouped dataset. **CF-netCDF coordinate variables** @@ -84,7 +87,7 @@ def netcdf_flatten( input_ds: The dataset to be flattened, that has the same API as - `netCDF4.Dataset` or `h5netcdf.File`. + `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` output_ds: `netCDF4.Dataset` A container for the flattened dataset. @@ -232,7 +235,7 @@ def generate_var_attr_str(d): class _Flattener: - """Information and methods needed to flatten a netCDF dataset. 
+ """Information and methods needed to flatten a dataset. Contains the input file, the output file being flattened, and all the logic of the flattening process. @@ -255,7 +258,7 @@ def __init__( input_ds: The dataset to be flattened, that has the same API as - `netCDF4.Dataset` or `h5netcdf.File`. + `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group`. output_ds: `netCDF4.Dataset` A container for the flattened dataset. @@ -277,7 +280,7 @@ def __init__( self._attr_map_value = [] # Mapping of flattened dimension names to their full-path - # counterparts. + # counterparts # # E.g. ['bounds2: /bounds2', # 'x: /x', @@ -285,7 +288,7 @@ def __init__( self._dim_map_value = [] # Mapping of flattened variable names to their full-path - # counterparts. + # counterparts # # E.g. ['x_bnds: /x_bnds', # 'x: /x', @@ -296,7 +299,7 @@ def __init__( self._var_map_value = [] # Mapping of full-path dimension names to their flattened - # counterparts. + # counterparts # # E.g. {'/bounds2': 'bounds2', # '/x': 'x', @@ -304,7 +307,7 @@ def __init__( self._dim_map = {} # Mapping of full-path variable names to their flattened - # counterparts. + # counterparts # # E.g. {'/x_bnds': 'x_bnds', # '/x': 'x', @@ -314,6 +317,24 @@ def __init__( # '/forecast/y': 'forecast__y'} self._var_map = {} + # Mapping of full-path group names to the dimensions defined + # therein + # + # E.g. {'/': {'feature': , + # 'station': }, + # '/forecast': {'element': }, + # '/forecast/model': {}} + self._group_to_dims = {} + + # Mapping of variable names to their Dimension objects. + # + # E.g. {'x': [], + # 'x_bnds': [, + # ], + # 'latitude_longitude': [], + # 'forecast/y': []} + self._var_to_dims = {} + self._input_ds = input_ds self._output_ds = output_ds @@ -344,6 +365,8 @@ def __init__( "be different, and output should be of the 'NETCDF4' format." ) + self._debug = is_log_level_debug(logger) + def attrs(self, variable, dataset=None): """Return the variable attributes. @@ -352,8 +375,7 @@ def attrs(self, variable, dataset=None): :Parameters: variable: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. :Returns: @@ -380,8 +402,7 @@ def chunksizes(self, variable): :Parameters: variable: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. :Returns: @@ -417,8 +438,7 @@ def contiguous(self, variable): :Parameters: variable: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. :Returns: @@ -447,8 +467,7 @@ def dtype(self, variable): :Parameters: variable: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. :Returns: @@ -480,8 +499,7 @@ def endian(self, variable): :Parameters: variable: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. :Returns: @@ -511,8 +529,7 @@ def filepath(self, dataset): :Parameters: dataset: - The dataset, that has the same API as - `netCDF4.Dataset` or `h5netcdf.File`. + The dataset. :Returns: @@ -571,7 +588,7 @@ def get_dims(self, variable): return [dims[name] for name in variable.dimensions] case "zarr": - return tuple(self._zarr_var_to_dims[variable.path]) + return tuple(self._var_to_dims[variable.path]) def getncattr(self, x, attr): """Retrieve a netCDF attribute. 
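The two new mappings documented above (_group_to_dims and _var_to_dims), together with the existing _dim_map/_var_map dictionaries, drive the renaming: every group-qualified name becomes a root-level name and the correspondence is recorded so that a reader can invert it. Purely illustrative values, following the examples in the comments (the flattener_*_map attribute names come from flatten/config.py, imported at the top of this module):

    # Full-path name -> flattened name, as built up during flattening
    var_map = {
        "/x": "x",
        "/forecast/y": "forecast__y",              # group separators become '__'
        "/forecast/model/q": "forecast__model__q",
    }

    # The inverse mapping is written to the flattened dataset as global
    # attributes (flattener_variable_map, flattener_dimension_map, ...), one
    # "flat_name: /full/path" string per element:
    var_map_value = [f"{flat}: {full}" for full, flat in var_map.items()]
    # ['x: /x', 'forecast__y: /forecast/y', 'forecast__model__q: /forecast/model/q']
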
@@ -679,10 +696,7 @@ def parent(self, group): """ match self._backend(): case "h5netcdf" | "netCDF4": -# try: return group.parent -# except AttributeError: -# return case "zarr": name = group.name @@ -698,6 +712,11 @@ def path(self, group): .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + group: + The group object. + :Returns: `str` @@ -726,7 +745,10 @@ def flatten(self): input_ds = self._input_ds output_ds = self._output_ds - logging.info(f"Flattening the groups of {self.filepath(input_ds)}") + if self._debug: + logger.debug( + f"Flattening the groups of {self.filepath(input_ds)}" + ) # pragma: no cover # Flatten product self.process_group(input_ds) @@ -737,10 +759,12 @@ def flatten(self): output_ds.setncattr(flattener_variable_map, self._var_map_value) # Browse flattened variables to rename references: - logging.info( - " Browsing flattened variables to rename references " - "in attributes" - ) + if self._debug: + logger.debug( + " Browsing flattened variables to rename references " + "in attributes" + ) # pragma: no cover + for var in output_ds.variables.values(): self.adapt_references(var) @@ -751,15 +775,18 @@ def process_group(self, input_group): :Parameters: - input_group: `str` - The group to flatten. + input_group: + The group object to flatten. :Returns: `None` """ - logging.info(f" Browsing group {self.path(input_group)}") + if self._debug: + logger.debug( + f" Browsing group {self.path(input_group)}" + ) # pragma: no cover for attr_name in self.ncattrs(input_group): self.flatten_attribute(input_group, attr_name) @@ -788,8 +815,8 @@ def flatten_attribute(self, input_group, attr_name): :Parameters: - input_group: `str` - The group containing the attribute to flatten. + input_group: + The group object containing the attribute to flatten. attr_name: `str` The name of the attribute. @@ -799,14 +826,15 @@ def flatten_attribute(self, input_group, attr_name): `None` """ - logging.info( - f" Copying attribute {attr_name} from " - f"group {self.path(input_group)} to root" - ) - # Create new name new_attr_name = self.generate_flattened_name(input_group, attr_name) + if self._debug: + logger.debug( + f" Creating global attribute {new_attr_name!r} from " + f"group {self.path(input_group)}" + ) # pragma: no cover + # Write attribute self._output_ds.setncattr( new_attr_name, self.getncattr(input_group, attr_name) @@ -825,23 +853,28 @@ def flatten_dimension(self, dim): :Parameters: dim: - The dimension to flatten, that has the same API as - `netCDF4.Dimension` or `h5netcdf.Dimension`. + The dimension object to flatten. 
:Returns: `None` """ - logging.info( - f" Copying dimension {self.name(dim)} from " - f"group {self.path(self.group(dim))} to root" - ) + # logger.debug( + # f" Creating dimension {self.name(dim)!r} from " + # f"group {self.path(self.group(dim))} to root" + # ) # Create new name - new_name = self.generate_flattened_name( - self.group(dim), self.name(dim) - ) + group = self.group(dim) + name = self.name(dim) + new_name = self.generate_flattened_name(group, name) + + if self._debug: + logger.debug( + f" Creating dimension {new_name!r} from " + f"group {self.path(group)}" + ) # pragma: no cover # Write dimension self._output_ds.createDimension( @@ -849,15 +882,11 @@ def flatten_dimension(self, dim): ) # Store new name in dict for resolving references later - self._dim_map[self.pathname(self.group(dim), self.name(dim))] = ( - new_name - ) + self._dim_map[self.pathname(group, name)] = new_name # Add to name mapping attribute self._dim_map_value.append( - self.generate_mapping_str( - self.group(dim), self.name(dim), new_name - ) + self.generate_mapping_str(group, name, new_name) ) def flatten_variable(self, var): @@ -868,18 +897,17 @@ def flatten_variable(self, var): :Parameters: var: - The variable, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The variable object. :Returns: `None` """ - logging.info( - f" Copying variable {self.name(var)} from " - f"group {self.path(self.group(var))} to root" - ) + # logger.debug( + # f" Copying variable {self.name(var)!r} from " + # f"group {self.path(self.group(var))} to root" + # ) # Create new name new_name = self.generate_flattened_name( @@ -897,8 +925,11 @@ def flatten_variable(self, var): ) # Write variable - fullname = self.pathname(self.group(var), self.name(var)) - logging.info(f" Creating variable {new_name} from {fullname}") + if self._debug: + logger.debug( + f" Creating variable {new_name!r} from " + f"{self.pathname(self.group(var), self.name(var))!r}" + ) # pragma: no cover attributes = self.attrs(var) @@ -1009,13 +1040,11 @@ def write_data_in_chunks(self, old_var, new_var): :Parameters: old_var: - The variable where the data should be copied from, - that has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The variable object where the data should be copied + from. new_var: - The new variable in which to copy the data, that has the - same API as `netCDF4.Variable` or `h5netcdf.Variable`. + The new variable object in which to copy the data. :Returns: @@ -1028,10 +1057,12 @@ def write_data_in_chunks(self, old_var, new_var): (self.write_chunksize // (old_var.dtype.itemsize * ndim)), ) * ndim - logging.info( - f" Copying {self.name(old_var)!r} data in chunks of " - f"{chunk_shape}" - ) + if self._debug: + logger.debug( + f" Copying {self.name(old_var)!r} data in chunks of " + f"{chunk_shape}" + ) # pragma: no cover + # Initial position vector pos = [0] * ndim @@ -1066,9 +1097,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): The reference to resolve. orig_var: - The original variable containing the reference, that - has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object containing the reference. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1079,7 +1108,6 @@ def resolve_reference(self, orig_ref, orig_var, rules): The absolute path to the reference. 
""" -# print ('\nA', orig_ref, rules.name) ref = orig_ref absolute_ref = None ref_type = "" @@ -1107,7 +1135,7 @@ def resolve_reference(self, orig_ref, orig_var, rules): ref_type = "dimension" else: ref_type = "variable" - + absolute_ref = self.search_by_relative_path( orig_ref, self.group(orig_var), resolve_dim_or_var ) @@ -1165,9 +1193,7 @@ def resolve_reference_proximity( and vice versa. orig_var: - The original variable containing the reference, that - has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object containing the reference. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1239,9 +1265,7 @@ def resolve_reference_post_processing( The original reference. orig_var: - The original variable containing the reference, that - has the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object containing the reference. rules: `FlatteningRules` The flattening rules that apply to the reference. @@ -1262,10 +1286,12 @@ def resolve_reference_post_processing( """ # If not found and accept standard name, assume standard name if absolute_ref is None and rules.accept_standard_names: - logging.info( - f" Reference to {orig_ref!r} not " - "resolved. Assumed to be a standard name." - ) + if self._debug: + logger.debug( + f" Reference to {orig_ref!r} not " + "resolved. Assumed to be a standard name." + ) # pragma: no cover + ref_type = "standard_name" absolute_ref = orig_ref elif absolute_ref is None: @@ -1275,10 +1301,11 @@ def resolve_reference_post_processing( ) else: # Found - logging.info( - f" {method} reference to {ref_type} " - f"{orig_ref!r} resolved as {absolute_ref!r}" - ) + if self._debug: + logger.debug( + f" {method} reference to {ref_type} " + f"{orig_ref!r} resolved as {absolute_ref!r}" + ) # pragma: no cover # If variables refs are limited to coordinate variable, # additional check @@ -1293,10 +1320,12 @@ def resolve_reference_post_processing( or self._input_ds[absolute_ref].ndim > 0 ) ): - logging.info( - f" Reference to {orig_ref!r} is not a " - "scalar coordinate variable. Assumed to be a standard name." - ) + if self._debug: + logger.debug( + f" Reference to {orig_ref!r} is not a scalar " + "coordinate variable. Assumed to be a standard name." + ) # pragma: no cover + absolute_ref = orig_ref # Return result @@ -1315,8 +1344,8 @@ def search_by_relative_path(self, ref, current_group, search_dim): ref: `str` The reference to resolve. - current_group: `str` - The current group of the reference. + current_group: + The current group object of the reference. 
search_dim: `bool` If True then search for a dimension, otherwise a @@ -1330,7 +1359,7 @@ def search_by_relative_path(self, ref, current_group, search_dim): """ # Go up parent groups while ref.startswith("../"): - parent = self.parent(current_group) + parent = self.parent(current_group) if parent is None: return @@ -1341,18 +1370,15 @@ def search_by_relative_path(self, ref, current_group, search_dim): ref_split = ref.split(group_separator) for g in ref_split[:-1]: try: -# current_group = current_group.groups[g] current_group = self._child_groups(current_group)[g] except KeyError: return # Get variable or dimension if search_dim: -# elt = current_group.dimensions[ref_split[-1]] elt = tuple(self._dimensions(current_group))[ref_split[-1]] else: - # elt = current_group.variables[ref_split[-1]] elt = tuple(self._variables(current_group))[ref_split[-1]] # Get absolute reference @@ -1383,7 +1409,7 @@ def search_by_proximity( The reference to resolve. current_group: - The current group where searching. + The current group object where searching. search_dim: `bool` If True then search for a dimension, otherwise a @@ -1410,13 +1436,14 @@ def search_by_proximity( dims_or_vars = self._variables(current_group) # Found in current group - if ref in dims_or_vars: #.keys(): + if ref in dims_or_vars: # .keys(): return dims_or_vars[ref] local_apex_reached = ( # local_apex_reached or ref in current_group.dimensions.keys() local_apex_reached - or ref in dims_or_vars # TODOZARR self._dimensions(current_group).keys() + or ref + in dims_or_vars # TODOZARR self._dimensions(current_group).keys() ) # Check if have to continue looking in parent group @@ -1444,7 +1471,7 @@ def search_by_proximity( # Coordinate variable and local apex reached, so search # down in siblings. found_elt = None -# for child_group in current_group.groups.values(): + # for child_group in current_group.groups.values(): for child_group in self._child_groups(current_group).values(): found_elt = self.search_by_proximity( ref, @@ -1472,14 +1499,11 @@ def resolve_references(self, var, old_var): :Parameters: var: - The flattened variable in which references should be - renamed with absolute references, that has the same - API as `netCDF4.Variable` or `h5netcdf.Variable`. + The flattened variable object in which references + should be renamed with absolute references. old_var: - The original variable (in group structure), that has - the same API as `netCDF4.Variable` or - `h5netcdf.Variable`. + The original variable object (in group structure). :Returns: @@ -1525,9 +1549,8 @@ def adapt_references(self, var): :Parameters: var: - The flattened variable in which references should be - renamed with new names, that has the same API as - `netCDF4.Variable` or `h5netcdf.Variable`. + The flattened variable object in which references + should be renamed with new names. :Returns: @@ -1558,10 +1581,12 @@ def adapt_references(self, var): new_attr_value = generate_var_attr_str(adapted_parsed_attr) var.setncattr(name, new_attr_value) - logging.info( - f" Value of {self.name(var, 'output')}.{name} " - f"changed from {value!r} to {new_attr_value!r}" - ) + if self._debug: + logger.debug( + " Value of attribute " + f"{self.name(var, 'output')}.{name} " + f"changed from {value!r} to {new_attr_value!r}" + ) # pragma: no cover def adapt_name(self, resolved_ref, rules): """Apapt the name. @@ -1571,6 +1596,8 @@ def adapt_name(self, resolved_ref, rules): .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + resolved_ref: `str` The resolved reference. 
@@ -1622,7 +1649,7 @@ def adapt_name(self, resolved_ref, rules): # If not found, raise exception return self.handle_reference_error(rules.name, resolved_ref) - def pathname(self, group, name): + def pathname(self, group, name=None): """Compose full path name to an element in a group structure. .. versionadded:: (cfdm) 1.11.2.0 @@ -1630,7 +1657,7 @@ def pathname(self, group, name): :Parameters: current_group: - The group containing the dimension or variable. + The group object containing the dimension or variable. name: `str` The name of the dimension or variable. @@ -1643,7 +1670,7 @@ def pathname(self, group, name): """ if self.parent(group) is None: return group_separator + name - + return group_separator.join((self.path(group), name)) def generate_mapping_str(self, input_group, name, new_name): @@ -1657,8 +1684,8 @@ def generate_mapping_str(self, input_group, name, new_name): :Parameters: input_group: - The group containing the non-flattened dimension or - variable. + The group object containing the non-flattened + dimension or variable. name: `str` The name of the non-flattened dimension or variable. @@ -1716,7 +1743,7 @@ def generate_flattened_name(self, input_group, orig_name): :Parameters: input_group: - The group containing the dimension or variable. + The group object containing the dimension or variable. orig_name: `str` The original name of the dimension or variable. @@ -1802,7 +1829,7 @@ def _dimensions(self, group): :Parameters: group: - The group to inspect. + The group object. :Returns: @@ -1815,27 +1842,25 @@ def _dimensions(self, group): return group.dimensions case "zarr": + group_name = self.path(group) + # print('Group:', repr( group_name )) + dimensions = self._group_to_dims.get(group_name) + if dimensions is not None: + # We've already found this group's dimensions + return dimensions + from ..zarr import ZarrDimension - print('Group:', repr(self.name(group))) + if not hasattr(self, "_zarr_dims"): - # Mapping of dimension names to Dimension objects. + # Mapping of dimension names *as they appear in + # the Zarr dataset* to Dimension objects. # # E.g. {'x': , # 'y': , # 'bounds2': , # '/forecast/y': } self._zarr_dims = {} - - if not hasattr(self, "_zarr_var_to_dims"): - # Mapping of variable names to their Dimension objects. - # - # E.g. {'x': [], - # 'x_bnds': [, - # ], - # 'latitude_longitude': [], - # 'forecast/y': []} - self._zarr_var_to_dims = {} - + # Loop round this group's variables, finding the # dimension_names for each one. 
dimensions = {} @@ -1849,19 +1874,62 @@ def _dimensions(self, group): if name in self._zarr_dims: continue - basename = name.split(group_separator)[-1] - if basename in dimensions: + name_split = name.split(group_separator) + basename = name_split[-1] + + #if group_separator in name: + # if name.startswith(group_separator): + # g = group_separator.join(name_split[:-1]) + # if g == group_name: + # # Dimension is defined in the + # # current group + # name = basename + # else: + # for i in range(2, ???): + # g = group_separator.join(name_split[:-i]) + # if g in self._group_to_dims and basename in self._group_to_dims.setdefault(, {}): + # + # + # + # + # if dim_group is this group: + # name = basename + # elif dim exissts in dim_group: + # continue + # + #if name == basename and name in dimensions: + # # Already seen this dim in this group + # continue + + if name in dimensions: + # Already seen this dim in this group continue +# basename = name.split(group_separator)[-1] + +# # Check for basename in parnet groups, and if +# # it's not there then create the imension in +# # this group +# if name.startswith(group_separator): +# found = False +# for g in name.split(group_separator)[1:-1:-1]: +# if basename in self._group_to_dims.get(g, ()): +# found = True +# +# if found : +# continue +# if basename in dimensions: +# continue + zd = ZarrDimension(basename, size, group) dimensions[basename] = zd - self._zarr_dims[name] = zd # TODOZARR RESOLVE NAME? + self._zarr_dims[name] = zd self._zarr_dims.update(dimensions) - print(' self._zarr_dims =',self._zarr_dims) - - # Map this group's variables to their dimension + # print(' self._zarr_dims =',list(self._zarr_dims)) + + # Map this group's variable names to their dimension # objects for v in group.array_values(): dimension_names = v.metadata.dimension_names @@ -1869,13 +1937,16 @@ def _dimensions(self, group): # Scalar variable dimension_names = () - self._zarr_var_to_dims[v.path] = [ + self._var_to_dims[v.path] = [ self._zarr_dims[name] for name in dimension_names ] - print(" self._zarr_var_to_dims=", self._zarr_var_to_dims) - - print('current group dimensions', dimensions) + print(group_name) + # print(" self._var_to_dims=", self._var_to_dims) + # print(' current group dimensions', list(dimensions)) + self._group_to_dims.setdefault(group_name, {}) + self._group_to_dims[group_name].update(dimensions) + print(' self._group_to_dims =',self._group_to_dims) return dimensions def _variables(self, group): @@ -1886,7 +1957,7 @@ def _variables(self, group): :Parameters: group: - The group to inspect. + The group object. :Returns: @@ -1909,7 +1980,7 @@ def _child_groups(self, group): :Parameters: group: - The group to inspect. + The group object. :Returns: @@ -1949,7 +2020,8 @@ def _backend(self, dataset=None): if dataset == "output": return "netCDF4" - raise("Bad value of 'dataset'") + raise ("Bad value of 'dataset'") + class AttributeParsingException(Exception): """Exception for unparsable attribute. 
diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index c94a9787f..09c34d5d5 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -305,7 +305,7 @@ def test_groups_geometry(self): g.nc_set_component_variable("interior_ring", "interior_ring") g.nc_set_component_variable_groups("interior_ring", ["forecast"]) - grouped_file='grouped_file.nc' + grouped_file = "grouped_file.nc" cfdm.write(g, grouped_file) # Check that the variable is in the right group diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 6eb34c23d..02e3c2ced 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -182,7 +182,9 @@ def test_zarr_groups(self): # Set some groups f.nc_set_variable_groups(["forecast", "model"]) - f.construct("grid_latitude").bounds.nc_set_variable_groups(["forecast"]) + f.construct("grid_latitude").bounds.nc_set_variable_groups( + ["forecast"] + ) for name in ( "longitude", # Auxiliary coordinate "latitude", # Auxiliary coordinate @@ -195,11 +197,11 @@ def test_zarr_groups(self): "grid_latitude", # Dimension coordinate ): f.construct(name).nc_set_variable_groups(["forecast"]) - + # Check the groups - cfdm.write(f, grouped_file, fmt='NETCDF4') - cfdm.write(f, grouped_dir, fmt='ZARR3') - + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") + n = cfdm.read(grouped_file)[0] z = cfdm.read(grouped_dir)[0] self.assertTrue(z.equals(n)) @@ -207,13 +209,13 @@ def test_zarr_groups(self): # Directly check the groups in the Zarr dataset x = zarr.open(grouped_dir) - self.assertEqual(list(x.group_keys()), ['forecast']) - self.assertEqual(list(x['forecast'].group_keys()), ['model']) + self.assertEqual(list(x.group_keys()), ["forecast"]) + self.assertEqual(list(x["forecast"].group_keys()), ["model"]) - cfdm.write(z, tmpdir2, fmt='ZARR3') + cfdm.write(z, tmpdir2, fmt="ZARR3") z1 = cfdm.read(tmpdir2)[0] - self.assertTrue(z1.equals(f)) - + self.assertTrue(z1.equals(f)) + def test_zarr_groups_dimension(self): """Test the dimensions of Zarr hierarchical groups.""" f = self.f0.copy() @@ -236,8 +238,8 @@ def test_zarr_groups_dimension(self): domain_axis.nc_set_dimension_groups(["forecast"]) # Check the groups - cfdm.write(f, grouped_file, fmt='NETCDF4') - cfdm.write(f, grouped_dir, fmt='ZARR3') + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") n = cfdm.read(grouped_file)[0] z = cfdm.read(grouped_dir)[0] @@ -248,68 +250,46 @@ def test_zarr_groups_compression(self): """Test the compression of Zarr hierarchical groups.""" f = cfdm.example_field(4) - grouped_dir = 'tmpdir1' - grouped_file = 'tmpfile1.nc' + grouped_dir = "tmpdir1" + grouped_file = "tmpfile1.nc" f.compress("indexed_contiguous", inplace=True) f.data.get_count().nc_set_variable("count") f.data.get_index().nc_set_variable("index") - # Set some groups. (Write the read the field first to create # the compressions variables on disk.) 
cfdm.write(f, tmpfile2) f = cfdm.read(tmpfile2)[0] - - # ------------------------------------------------------------ - # Move the field construct to the /forecast/model group - # ------------------------------------------------------------ - f.nc_set_variable_groups(["forecast", "model"]) - # ------------------------------------------------------------ - # Move the count variable to the /forecast group - # ------------------------------------------------------------ + # Set some groups + f.nc_set_variable_groups(["forecast", "model"]) f.data.get_count().nc_set_variable_groups(["forecast"]) - - # ------------------------------------------------------------ - # Move the index variable to the /forecast group - # ------------------------------------------------------------ -# f.data.get_index().nc_set_variable_groups(["forecast"]) - - # ------------------------------------------------------------ - # Move the coordinates that span the element dimension to the - # /forecast group - # ------------------------------------------------------------ + f.data.get_index().nc_set_variable_groups(["forecast"]) f.construct("altitude").nc_set_variable_groups(["forecast"]) - - # ------------------------------------------------------------ - # Move the sample dimension to the /forecast group - # ------------------------------------------------------------ f.data.get_count().nc_set_sample_dimension_groups(["forecast"]) - print(f) - cfdm.write(f, grouped_file, fmt='NETCDF4') - cfdm.write(f, grouped_dir, fmt='ZARR3') + + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") n = cfdm.read(grouped_file) - z = cfdm.read(grouped_dir) - print(n) - print (z) + z = cfdm.read(grouped_dir, verbose=-1) n = n[0] z = z[0] - self.assertTrue(z.equals(n, verbose=-1)) + self.assertTrue(z.equals(n)) self.assertTrue(z.equals(f)) def test_zarr_groups_geometry(self): - """Test that geometries are considered in the correct Zarr groups.""" + """Test that geometries in Zarr groups.""" f = cfdm.example_field(6) grouped_dir = tmpdir1 - grouped_file = tmpfile1.nc - + grouped_file = tmpfile1 + cfdm.write(f, tmpfile2) f = cfdm.read(tmpfile2)[0] - + # Set some groups f.nc_set_variable_groups(["forecast", "model"]) f.nc_set_geometry_variable_groups(["forecast"]) @@ -320,8 +300,8 @@ def test_zarr_groups_geometry(self): f.nc_set_component_variable_groups("interior_ring", ["forecast"]) # Check the groups - cfdm.write(f, grouped_file, fmt='NETCDF4') - cfdm.write(f, grouped_dir, fmt='ZARR3') + cfdm.write(f, grouped_file, fmt="NETCDF4") + cfdm.write(f, grouped_dir, fmt="ZARR3") n = cfdm.read(grouped_file)[0] z = cfdm.read(grouped_dir)[0] @@ -329,7 +309,6 @@ def test_zarr_groups_geometry(self): self.assertTrue(z.equals(f)) - if __name__ == "__main__": print("Run date:", datetime.datetime.now()) cfdm.environment() From b72fb6a4062546696ef1df9ddcb69bb5a5cd6d29 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 31 Aug 2025 18:13:27 +0100 Subject: [PATCH 14/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 398 ++++++++++++++++------ 1 file changed, 289 insertions(+), 109 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index f22cb788b..9c54a2f85 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -588,7 +588,7 @@ def get_dims(self, variable): return [dims[name] for name in variable.dimensions] case "zarr": - return tuple(self._var_to_dims[variable.path]) + return 
tuple(self._var_to_dims[variable.name]) def getncattr(self, x, attr): """Retrieve a netCDF attribute. @@ -873,7 +873,7 @@ def flatten_dimension(self, dim): if self._debug: logger.debug( f" Creating dimension {new_name!r} from " - f"group {self.path(group)}" + f"group {self.path(group)!r}" ) # pragma: no cover # Write dimension @@ -939,6 +939,7 @@ def flatten_variable(self, var): else: fill_value = attributes.pop("_FillValue", None) + # print ( new_name, self.get_dims(var), new_dims, self.chunksizes(var)) new_var = self._output_ds.createVariable( new_name, self.dtype(var), @@ -1358,7 +1359,7 @@ def search_by_relative_path(self, ref, current_group, search_dim): """ # Go up parent groups - while ref.startswith("../"): + while ref.startswith(f"..{group_separator}"): parent = self.parent(current_group) if parent is None: return @@ -1670,7 +1671,7 @@ def pathname(self, group, name=None): """ if self.parent(group) is None: return group_separator + name - + return group_separator.join((self.path(group), name)) def generate_mapping_str(self, input_group, name, new_name): @@ -1834,7 +1835,8 @@ def _dimensions(self, group): :Returns: `dict`-like - The dimensions, keyed by their names. + The dimensions defined in the group, keyed by the + group name. """ match self._backend(): @@ -1843,111 +1845,12 @@ def _dimensions(self, group): case "zarr": group_name = self.path(group) - # print('Group:', repr( group_name )) - dimensions = self._group_to_dims.get(group_name) - if dimensions is not None: - # We've already found this group's dimensions - return dimensions - - from ..zarr import ZarrDimension + if group_name not in self._group_to_dims: + # Populate the `_group_to_dims` dictionary (we + # should only to this once per call of `flatten`) + self._populate_group_to_dims(group) - if not hasattr(self, "_zarr_dims"): - # Mapping of dimension names *as they appear in - # the Zarr dataset* to Dimension objects. - # - # E.g. {'x': , - # 'y': , - # 'bounds2': , - # '/forecast/y': } - self._zarr_dims = {} - - # Loop round this group's variables, finding the - # dimension_names for each one. 
- dimensions = {} - for v in group.array_values(): - dimension_names = v.metadata.dimension_names - if dimension_names is None: - # Scalar variable - continue - - for name, size in zip(dimension_names, v.shape): - if name in self._zarr_dims: - continue - - name_split = name.split(group_separator) - basename = name_split[-1] - - #if group_separator in name: - # if name.startswith(group_separator): - # g = group_separator.join(name_split[:-1]) - # if g == group_name: - # # Dimension is defined in the - # # current group - # name = basename - # else: - # for i in range(2, ???): - # g = group_separator.join(name_split[:-i]) - # if g in self._group_to_dims and basename in self._group_to_dims.setdefault(, {}): - # - # - # - # - # if dim_group is this group: - # name = basename - # elif dim exissts in dim_group: - # continue - # - #if name == basename and name in dimensions: - # # Already seen this dim in this group - # continue - - if name in dimensions: - # Already seen this dim in this group - continue - -# basename = name.split(group_separator)[-1] - -# # Check for basename in parnet groups, and if -# # it's not there then create the imension in -# # this group -# if name.startswith(group_separator): -# found = False -# for g in name.split(group_separator)[1:-1:-1]: -# if basename in self._group_to_dims.get(g, ()): -# found = True -# -# if found : -# continue -# if basename in dimensions: -# continue - - zd = ZarrDimension(basename, size, group) - dimensions[basename] = zd - self._zarr_dims[name] = zd - - self._zarr_dims.update(dimensions) - - # print(' self._zarr_dims =',list(self._zarr_dims)) - - # Map this group's variable names to their dimension - # objects - for v in group.array_values(): - dimension_names = v.metadata.dimension_names - if dimension_names is None: - # Scalar variable - dimension_names = () - - self._var_to_dims[v.path] = [ - self._zarr_dims[name] for name in dimension_names - ] - - print(group_name) - # print(" self._var_to_dims=", self._var_to_dims) - # print(' current group dimensions', list(dimensions)) - self._group_to_dims.setdefault(group_name, {}) - self._group_to_dims[group_name].update(dimensions) - print(' self._group_to_dims =',self._group_to_dims) - return dimensions + return self._group_to_dims[group_name] def _variables(self, group): """Return variables that are defined in this group. @@ -1972,6 +1875,273 @@ def _variables(self, group): case "zarr": return dict(group.arrays()) + def _populate_group_to_dims(self, group): + """Populate the `self._group_to_dims` dictionary. + + For the given group and all of its child groups, a mapping of + full-path group names to the unique dimensions implied by the + varibles therein will be added to `self._group_to_dims`. For + instance:: + + {'/': {'feature': , + 'station': }, + '/forecast': {'element': }, + '/forecast/model': {}} + + **Zarr** + + This is only required for a Zarr grouped dataset, for which + this information is not explicitly defined in the format's + data model (unlike for netCDF-3 and netCDF-4 datasets). + + To create the mapping, we need to impose an understanding of + CF dimensions and groups onto the contents of the Zarr + dataset: + + * A dimension name which contains no '/' (group separator) + characters is assumed assumed to be the same logical + dimension object as one with the same name and same size in + one of its parent groups, if one exists. When multiple + parents contain the dimension name, the parent closest to + the root dimension is used. 
+ + * An exception is raised if a dimension name ends with '/'. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + group: + The group object. + + :Returns: + + `None` + + """ + from ..zarr import ZarrDimension + + group_name = self.path(group) + + input_ds = self._input_ds + var_to_dims = self._var_to_dims + + group_to_dims = self._group_to_dims + group_to_dims.setdefault(group_name, {}) + + # Loop over variables in this group, sorted by variable name. + for v in dict(sorted(group.arrays())).values(): + dimension_names = v.metadata.dimension_names + if dimension_names is None: + # Scalar variable has no dimensions + var_to_dims[v.name] = [] + continue + + # Loop over this variable's dimension names + for name, size in zip(dimension_names, v.shape): + name_split = name.split(group_separator) + basename = name_split[-1] + + if group_separator not in name: + # ------------------------------------------------ + # Relative path dimension name which contains no + # '/' characters and which has no upward path + # traversals + # + # E.g. "dim" + # + # We're looking for a dimension with the same name + # and same size in one of its parent groups, if + # one exists. If multiple parents contain it, the + # parent closest to the root dimension is chosen. + # ------------------------------------------------ + found_dim_in_parent = False + group_split = group_name.split(group_separator) + for n in range(1, len(group_split)): + parent_group = input_ds[ + group_separator.join(group_split)[:n] + ] + g = self.path(parent_group) + # Loop over variables in the parent group, + # sorted by variable name. + for parent_v in dict( + sorted(parent_group.arrays()) + ).values(): + dimensions2 = parent_v.metadata.dimension_names + if dimensions2 is None or name not in dimensions2: + continue + + zd = group_to_dims[g].get(basename) + if zd is not None and zd.size == size: + # Dimension 'basename' is already + # defined in 'parent_group' + found_dim_in_parent = True + break + + if found_dim_in_parent: + # Dimension 'basename' is already defined + # in a parent group + break + + if not found_dim_in_parent: + # Dimension 'basename' could not be matched to + # any parent group dimensions, so it needs to + # be defined in 'group'. + g = group_name + + else: + g = group_separator.join(name_split[:-1]) + if name.startswith(group_separator): + # -------------------------------------------- + # Absolute path dimension name + # + # E.g. "/group1/group2/dim" + # E.g. "/dim" + # -------------------------------------------- + if g == "": + g = group_separator + + elif name.endswith(group_separator): + # -------------------------------------------- + # E.g. g = "/group1/group2/" + # -------------------------------------------- + raise DimensionParsingException( + "Dimension names can not end with the group " + f"separator {group_separator!r}: {name!r} " + f"(group {group_name!r}, variable {v.name!r})" + ) + + elif name.startswith(f"..{group_separator}"): + # -------------------------------------------- + # Relative path dimension name with upward + # path traversals at the start of the name + # + # E.g. "../group1/group2/dim" + # E.g. "../../group1/group2/dim" + # -------------------------------------------- + current_group = group + while g.startswith(f"..{group_separator}"): + parent_group = self.parent(current_group) + current_group = parent_group + g = g[3:] + if parent_group is None: + # We've gone beyond the root group! 
+ raise DimensionParsingException( + "Unresolvable upward path traversals " + f"in dimension name: {name!r} " + f"(group {group_name!r}, " + f"variable {v.name!r})" + ) + + g = group_separator.join((self.path(current_group), g)) + + elif ".." in name_split[:-1]: + # -------------------------------------------- + # Relative path dimension name with upward + # path traversals not at the start of the name + # + # E.g. "/group1/../group2/dim" + # E.g. "../group1/../group2/dim" + # E.g. "../group1/../group2/../dim" + # -------------------------------------------- + current_group = group + while ".." in name_split[:-1]: + index = name_split.index("..") + parent_group = self.parent(current_group) + name_split[index] = + current_group = parent_group + g = g[3:] + if parent_group is None: + # We've gone beyond the root group! + raise DimensionParsingException( + "Unresolvable upward path traversals " + f"in dimension name: {name!r} " + f"(group {group_name!r}, " + f"variable {v.name!r})" + ) + + g = group_separator.join((self.path(current_group), g)) + + +# elif name.startswith(f"..{group_separator}"): +# # -------------------------------------------- +# # Relative path dimension name with upward +# # path traversals at the start of the name +# # +# # E.g. "../group1/group2/dim" +# # E.g. "../../group1/group2/dim" +# # -------------------------------------------- +# current_group = group +# while g.startswith(f"..{group_separator}"): +# parent_group = self.parent(current_group) +# current_group = parent_group +# g = g[3:] +# if parent_group is None: +# # We've gone beyond the root group! +# raise DimensionParsingException( +# "Unresolvable upward path traversals " +# f"in dimension name: {name!r} " +# f"(group {group_name!r}, " +# f"variable {v.name!r})" +# ) +# +# g = group_separator.join((self.path(current_group), g)) +# +# elif f"..{group_separator}" in name: +# # -------------------------------------------- +# # Relative path dimension name with upward +# # path traversals not at the start of the name +# # +# # E.g. "/group1/../group2/dim" +# # E.g. "../group1/../group2/dim" +# # E.g. "../group1/../group2/../dim" +# # -------------------------------------------- +# raise DimensionParsingException( +# "In Zarr datasets, can't yet deal with a " +# "relative path dimension name with upward path " +# f"traversals in middle of the name: {name!r} " +# f"(group {group_name!r}, variable {v.name!r}). " +# "Please raise an issue at " +# "https://github.com/NCAS-CMS/cfdm/issues " +# "if you really do need this feature." +# ) + + else: + # -------------------------------------------- + # Relative path dimension name which contain + # '/' and which has no upward path traversals + # + # E.g. "group2/group3" + # -------------------------------------------- + g = group_separator.join((group_name, g)) + + if g in group_to_dims: + zd = group_to_dims[g].get(basename) + if zd is not None: + # Dimension 'basename' is already defined + if zd.size != size: + raise DimensionParsingException( + f"Dimension {name!r} of variable {v.name!r} " + f"has the wrong size ({size!r}). It should " + f"match the size of dimension {basename!r} " + f"in group {group_name!r} ({zd.size})" + ) + + var_to_dims.setdefault(v.name, []).append(zd) + continue + else: + group_to_dims[g] = {} + + # Still here? 
Then we're ready to define dimension + # 'basename' + zd = ZarrDimension(basename, size, input_ds[g]) + var_to_dims.setdefault(v.name, []).append(zd) + group_to_dims[g][basename] = zd + + # Recursively scan all child groups + for g in group.group_values(): + self._populate_group_to_dims(g) + def _child_groups(self, group): """Return groups that are defined in this group. @@ -2033,6 +2203,16 @@ class AttributeParsingException(Exception): pass +class DimensionParsingException(Exception): + """Exception for unparsable dimension. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + pass + + class UnresolvedReferenceException(Exception): """Exception for unresolvable references in attributes. From e96375a8e7ee47b99eae07c8eef9d3748cf4a9f6 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Sun, 31 Aug 2025 18:25:19 +0100 Subject: [PATCH 15/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 93 +++++------------------ 1 file changed, 20 insertions(+), 73 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 9c54a2f85..fabff6d60 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -2001,16 +2001,6 @@ def _populate_group_to_dims(self, group): if g == "": g = group_separator - elif name.endswith(group_separator): - # -------------------------------------------- - # E.g. g = "/group1/group2/" - # -------------------------------------------- - raise DimensionParsingException( - "Dimension names can not end with the group " - f"separator {group_separator!r}: {name!r} " - f"(group {group_name!r}, variable {v.name!r})" - ) - elif name.startswith(f"..{group_separator}"): # -------------------------------------------- # Relative path dimension name with upward @@ -2035,76 +2025,33 @@ def _populate_group_to_dims(self, group): g = group_separator.join((self.path(current_group), g)) - elif ".." in name_split[:-1]: + elif name.endswith(group_separator): + # -------------------------------------------- + # E.g. g = "/group1/group2/" + # -------------------------------------------- + raise DimensionParsingException( + "Dimension names can not end with the group " + f"separator {group_separator!r}: {name!r} " + f"(group {group_name!r}, variable {v.name!r})" + ) + + elif f"..{group_separator}" in name: # -------------------------------------------- # Relative path dimension name with upward # path traversals not at the start of the name # # E.g. "/group1/../group2/dim" # E.g. "../group1/../group2/dim" - # E.g. "../group1/../group2/../dim" # -------------------------------------------- - current_group = group - while ".." in name_split[:-1]: - index = name_split.index("..") - parent_group = self.parent(current_group) - name_split[index] = - current_group = parent_group - g = g[3:] - if parent_group is None: - # We've gone beyond the root group! - raise DimensionParsingException( - "Unresolvable upward path traversals " - f"in dimension name: {name!r} " - f"(group {group_name!r}, " - f"variable {v.name!r})" - ) - - g = group_separator.join((self.path(current_group), g)) - - -# elif name.startswith(f"..{group_separator}"): -# # -------------------------------------------- -# # Relative path dimension name with upward -# # path traversals at the start of the name -# # -# # E.g. "../group1/group2/dim" -# # E.g. 
"../../group1/group2/dim" -# # -------------------------------------------- -# current_group = group -# while g.startswith(f"..{group_separator}"): -# parent_group = self.parent(current_group) -# current_group = parent_group -# g = g[3:] -# if parent_group is None: -# # We've gone beyond the root group! -# raise DimensionParsingException( -# "Unresolvable upward path traversals " -# f"in dimension name: {name!r} " -# f"(group {group_name!r}, " -# f"variable {v.name!r})" -# ) -# -# g = group_separator.join((self.path(current_group), g)) -# -# elif f"..{group_separator}" in name: -# # -------------------------------------------- -# # Relative path dimension name with upward -# # path traversals not at the start of the name -# # -# # E.g. "/group1/../group2/dim" -# # E.g. "../group1/../group2/dim" -# # E.g. "../group1/../group2/../dim" -# # -------------------------------------------- -# raise DimensionParsingException( -# "In Zarr datasets, can't yet deal with a " -# "relative path dimension name with upward path " -# f"traversals in middle of the name: {name!r} " -# f"(group {group_name!r}, variable {v.name!r}). " -# "Please raise an issue at " -# "https://github.com/NCAS-CMS/cfdm/issues " -# "if you really do need this feature." -# ) + raise DimensionParsingException( + "In Zarr datasets, can't yet deal with a " + "relative path dimension name with upward path " + f"traversals in middle of the name: {name!r} " + f"(group {group_name!r}, variable {v.name!r}). " + "Please raise an issue at " + "https://github.com/NCAS-CMS/cfdm/issues " + "if you really do need this feature." + ) else: # -------------------------------------------- From 22d182e749b1f6070734eea469e19264b8ed5038 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 1 Sep 2025 14:38:11 +0100 Subject: [PATCH 16/39] dev --- cfdm/read_write/netcdf/flatten/flatten.py | 303 ++++++++++++++-------- cfdm/read_write/netcdf/netcdfwrite.py | 37 +-- cfdm/test/test_zarr.py | 4 +- 3 files changed, 217 insertions(+), 127 deletions(-) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index fabff6d60..19868abc6 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -86,8 +86,9 @@ def netcdf_flatten( :Parameters: input_ds: - The dataset to be flattened, that has the same API as - `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group` + The dataset to be flattened. Must be an object with the + the same API as `netCDF4.Dataset` or `h5netcdf.File`, or + else a `zarr.Group` object. output_ds: `netCDF4.Dataset` A container for the flattened dataset. @@ -257,8 +258,9 @@ def __init__( :Parameters: input_ds: - The dataset to be flattened, that has the same API as - `netCDF4.Dataset`, `h5netcdf.File`, or `zarr.Group`. + The dataset to be flattened. Must be an object with + the the same API as `netCDF4.Dataset` or + `h5netcdf.File`, or else a `zarr.Group` object. output_ds: `netCDF4.Dataset` A container for the flattened dataset. @@ -324,6 +326,10 @@ def __init__( # 'station': }, # '/forecast': {'element': }, # '/forecast/model': {}} + # + # Cuurently this mapping is only required for an input + # `zarr.Group` dataset, and is populated by + # `_populate_dimension_maps`. self._group_to_dims = {} # Mapping of variable names to their Dimension objects. 
@@ -333,6 +339,10 @@ def __init__( # ], # 'latitude_longitude': [], # 'forecast/y': []} + # + # Cuurently this mapping is only required for an input + # `zarr.Group` dataset, and is populated by + # `_populate_dimension_maps`. self._var_to_dims = {} self._input_ds = input_ds @@ -529,7 +539,7 @@ def filepath(self, dataset): :Parameters: dataset: - The dataset. + The dataset object. :Returns: @@ -1430,21 +1440,16 @@ def search_by_proximity( """ if search_dim: - # dims_or_vars = current_group.dimensions # TODOZARR dims_or_vars = self._dimensions(current_group) else: - # dims_or_vars = current_group.variables dims_or_vars = self._variables(current_group) # Found in current group - if ref in dims_or_vars: # .keys(): + if ref in dims_or_vars: return dims_or_vars[ref] - local_apex_reached = ( - # local_apex_reached or ref in current_group.dimensions.keys() - local_apex_reached - or ref - in dims_or_vars # TODOZARR self._dimensions(current_group).keys() + local_apex_reached = local_apex_reached or ref in self._dimensions( + current_group ) # Check if have to continue looking in parent group @@ -1452,10 +1457,8 @@ def search_by_proximity( # - coordinate variable: continue until local apex is reached parent_group = self.parent(current_group) if is_coordinate_variable: - # top_reached = local_apex_reached or current_group.parent is None top_reached = local_apex_reached or parent_group is None else: - # top_reached = current_group.parent is None top_reached = parent_group is None # Search up @@ -1472,7 +1475,6 @@ def search_by_proximity( # Coordinate variable and local apex reached, so search # down in siblings. found_elt = None - # for child_group in current_group.groups.values(): for child_group in self._child_groups(current_group).values(): found_elt = self.search_by_proximity( ref, @@ -1846,9 +1848,10 @@ def _dimensions(self, group): case "zarr": group_name = self.path(group) if group_name not in self._group_to_dims: - # Populate the `_group_to_dims` dictionary (we - # should only to this once per call of `flatten`) - self._populate_group_to_dims(group) + # Populate the `_group_to_dims` and `_var_to_dims` + # dictionaries (we only need do this once per call + # of `flatten`) + self._populate_dimension_maps(group) return self._group_to_dims[group_name] @@ -1875,37 +1878,111 @@ def _variables(self, group): case "zarr": return dict(group.arrays()) - def _populate_group_to_dims(self, group): - """Populate the `self._group_to_dims` dictionary. + def _populate_dimension_maps(self, group): + """Populate the dimension map dictionaries. For the given group and all of its child groups, a mapping of full-path group names to the unique dimensions implied by the - varibles therein will be added to `self._group_to_dims`. For + varibles therein will be added to `_group_to_dims`. For instance:: - {'/': {'feature': , - 'station': }, - '/forecast': {'element': }, + {'/': {}, + 'bounds2': , + 'x': }, + '/forecast': {'y': }, '/forecast/model': {}} + + For the given group and all of its child groups, a mapping of + full-path variables names to their dimensions will be added to + `_var_to_dims`. 
For instance:: + + {'/latitude_longitude': [], + '/x': [], + '/x_bnds': [ + ], + '/forecast/cell_measure': [, + ], + '/forecast/latitude': [, + ], + '/forecast/longitude': [, + ], + '/forecast/rotated_latitude_longitude': [], + '/forecast/time': [], + '/forecast/y': [], + '/forecast/y_bnds': [, + ], + '/forecast/model/ta': [, + ]} + **Zarr** - This is only required for a Zarr grouped dataset, for which - this information is not explicitly defined in the format's - data model (unlike for netCDF-3 and netCDF-4 datasets). + Populating the `_group_to_dims` dictionary is only + required for a Zarr grouped dataset, for which this + information is not explicitly defined in the format's data + model (unlike for netCDF-3 and netCDF-4 datasets). To create the mapping, we need to impose an understanding of CF dimensions and groups onto the contents of the Zarr - dataset: - - * A dimension name which contains no '/' (group separator) - characters is assumed assumed to be the same logical - dimension object as one with the same name and same size in - one of its parent groups, if one exists. When multiple - parents contain the dimension name, the parent closest to - the root dimension is used. - - * An exception is raised if a dimension name ends with '/'. + dataset. In CF, every dimension is explicitly declared in a + unique group, and that dimension may be referenced from + another group by applying one of the standardised CF search + algorithms (search by absolute path, search by relative path, + and search by proximity). + + In Zarr v3, a variable may name its dimensions, but there is + no "dimension class" in the data model that allows variables + and attributes to reference an explicit dimension defined in a + specific group. However, such a dimension class is required + when viewing the Zarr dataset with the netCDF data model, and + so a virtual "ZarrDimension" object that defines a dimension + in a unique group needs to be inferred. + + For dimension names that are absolute paths + (e.g. '/group1/dim') or relative paths that include a group + reference (e.g. 'group1/dim' or '../group1/dim') the group in + which to create the dimension is unambiguously defined. When a + dimension name has no path (e.g. 'dim') it may refer to a + logical dimension with that name in an ancestor group. In all + cases we have to consider the possibility that the size of + variable's dimension may not match the size of the + corresponding dimension ZarrDimension object. In this case, + depending on the nature of the Zarr dimension names, we must + either fail or else define a new ZarrDimension in a different + group. + + In this example, we must fail because only one ZarrDimension + object is implied ('dim' in group '/group1'), but the two + variables which require that dimension have different sizes (3 + and 999):: + + / + ├── variable1 (3,) (group/dim,) + ├── group1 + │ ├── variable2 (999,) (dim,) + + In this example we can reasonably create two ZarrDimension + objects: 'dim' in group '/' with size 3, and 'dim' in group + '/group1' with size 999:: + + / + ├── variable1 (3,) (dim,) + ├── group1 + │ ├── variable2 (999,) (dim,) + + In this example, we have no way of knowing if the 'dim' of + 'variable2' is the same logical dimension as the 'dim' of + 'variable1', or not:: + + / + ├── variable1 (3,) (dim,) + ├── group1 + │ ├── variable2 (3,) (dim,) + + Both options can be made to work, but this code always assumes + the former case, i.e. 
'dim' of 'variable2' is the same logical + dimension as the 'dim' of 'variable1', and so only one + ZarrDimension object is created: 'dim' in group '/'. .. versionadded:: (cfdm) NEXTVERSION @@ -1933,6 +2010,11 @@ def _populate_group_to_dims(self, group): for v in dict(sorted(group.arrays())).values(): dimension_names = v.metadata.dimension_names if dimension_names is None: + if v.shape: + raise DimensionParsingException( + "Non-scalar variable has no dimension names: {v.name}" + ) + # Scalar variable has no dimensions var_to_dims[v.name] = [] continue @@ -1942,61 +2024,67 @@ def _populate_group_to_dims(self, group): name_split = name.split(group_separator) basename = name_split[-1] + # ---------------------------------------------------- + # Define 'g' as the absolute path name of the group in + # which to register the logical dimension object for + # this dimension 'name' + # ---------------------------------------------------- if group_separator not in name: # ------------------------------------------------ # Relative path dimension name which contains no - # '/' characters and which has no upward path - # traversals + # '/' characters. # # E.g. "dim" # - # We're looking for a dimension with the same name - # and same size in one of its parent groups, if - # one exists. If multiple parents contain it, the - # parent closest to the root dimension is chosen. + # Search by proximity for a dimension that already + # exists. # ------------------------------------------------ found_dim_in_parent = False group_split = group_name.split(group_separator) - for n in range(1, len(group_split)): - parent_group = input_ds[ - group_separator.join(group_split)[:n] - ] - g = self.path(parent_group) - # Loop over variables in the parent group, - # sorted by variable name. - for parent_v in dict( - sorted(parent_group.arrays()) - ).values(): - dimensions2 = parent_v.metadata.dimension_names - if dimensions2 is None or name not in dimensions2: - continue - - zd = group_to_dims[g].get(basename) - if zd is not None and zd.size == size: - # Dimension 'basename' is already - # defined in 'parent_group' - found_dim_in_parent = True - break - - if found_dim_in_parent: - # Dimension 'basename' is already defined - # in a parent group + for n in range(len(group_split) - 1, 0, -1): + g = group_separator.join(group_split[:n]) + if g == "": + g = group_separator + + zarr_dim = group_to_dims[g].get(basename) + if zarr_dim is not None and zarr_dim.size == size: + # Found a dimension in this parent group + # with the right name and size + found_dim_in_parent = True break if not found_dim_in_parent: # Dimension 'basename' could not be matched to # any parent group dimensions, so it needs to - # be defined in 'group'. + # be defined in the current group. g = group_name else: g = group_separator.join(name_split[:-1]) - if name.startswith(group_separator): + if name.endswith(group_separator): # -------------------------------------------- - # Absolute path dimension name + # Dimension name that ends with '/' + # + # E.g. g = "dim/" + # E.g. g = "group1/dim/" + # -------------------------------------------- + raise DimensionParsingException( + "Dimension names can't end with the group " + f"separator ({group_separator}): " + f"dimension_name={name}, variable={v.name}" + ) + + elif ( + name.startswith(group_separator) + and f"..{group_separator}" not in name + ): + # -------------------------------------------- + # Absolute path dimension name that starts + # with '/', and has no upward path traversals + # ('../'). # - # E.g. 
"/group1/group2/dim" # E.g. "/dim" + # E.g. "/group1/dim" # -------------------------------------------- if g == "": g = group_separator @@ -2004,7 +2092,8 @@ def _populate_group_to_dims(self, group): elif name.startswith(f"..{group_separator}"): # -------------------------------------------- # Relative path dimension name with upward - # path traversals at the start of the name + # path traversals ('../') at the start of the + # name # # E.g. "../group1/group2/dim" # E.g. "../../group1/group2/dim" @@ -2014,40 +2103,34 @@ def _populate_group_to_dims(self, group): parent_group = self.parent(current_group) current_group = parent_group g = g[3:] - if parent_group is None: - # We've gone beyond the root group! + if parent_group is None and g.startswith( + f"..{group_separator}" + ): + # We're about to go beyond the root + # group! raise DimensionParsingException( - "Unresolvable upward path traversals " - f"in dimension name: {name!r} " - f"(group {group_name!r}, " - f"variable {v.name!r})" + "Upward path traversals in dimension " + "name go beyond the root group: " + f"dimension_name={name}, variable={v.name}" ) g = group_separator.join((self.path(current_group), g)) - elif name.endswith(group_separator): - # -------------------------------------------- - # E.g. g = "/group1/group2/" - # -------------------------------------------- - raise DimensionParsingException( - "Dimension names can not end with the group " - f"separator {group_separator!r}: {name!r} " - f"(group {group_name!r}, variable {v.name!r})" - ) - elif f"..{group_separator}" in name: # -------------------------------------------- # Relative path dimension name with upward - # path traversals not at the start of the name + # path traversals ('../') not at the start of + # the name # # E.g. "/group1/../group2/dim" + # E.g. "group1/../group2/dim" # E.g. "../group1/../group2/dim" # -------------------------------------------- raise DimensionParsingException( "In Zarr datasets, can't yet deal with a " "relative path dimension name with upward path " - f"traversals in middle of the name: {name!r} " - f"(group {group_name!r}, variable {v.name!r}). " + f"traversals (../) in middle of the name: " + f"dimension_name={name}, variable={v.name}. " "Please raise an issue at " "https://github.com/NCAS-CMS/cfdm/issues " "if you really do need this feature." @@ -2055,39 +2138,39 @@ def _populate_group_to_dims(self, group): else: # -------------------------------------------- - # Relative path dimension name which contain + # Relative path dimension name which contains # '/' and which has no upward path traversals # - # E.g. "group2/group3" + # E.g. "group2/dim" # -------------------------------------------- g = group_separator.join((group_name, g)) if g in group_to_dims: - zd = group_to_dims[g].get(basename) - if zd is not None: - # Dimension 'basename' is already defined - if zd.size != size: + zarr_dim = group_to_dims[g].get(basename) + if zarr_dim is not None: + # Dimension 'basename' is already registered + # in group 'g' + if zarr_dim.size != size: raise DimensionParsingException( - f"Dimension {name!r} of variable {v.name!r} " - f"has the wrong size ({size!r}). It should " - f"match the size of dimension {basename!r} " - f"in group {group_name!r} ({zd.size})" + f"Dimension {name} of variable {v.name} " + f"has the wrong size: {size}. 
It should " + f"have the size {zarr_dim.size}" ) - var_to_dims.setdefault(v.name, []).append(zd) + var_to_dims.setdefault(v.name, []).append(zarr_dim) continue - else: - group_to_dims[g] = {} + # else: + # group_to_dims[g] = {} # Still here? Then we're ready to define dimension - # 'basename' - zd = ZarrDimension(basename, size, input_ds[g]) - var_to_dims.setdefault(v.name, []).append(zd) - group_to_dims[g][basename] = zd + # 'basename' as a ZarrDimension object. + zarr_dim = ZarrDimension(basename, size, input_ds[g]) + var_to_dims.setdefault(v.name, []).append(zarr_dim) + group_to_dims[g][basename] = zarr_dim # Recursively scan all child groups for g in group.group_values(): - self._populate_group_to_dims(g) + self._populate_dimension_maps(g) def _child_groups(self, group): """Return groups that are defined in this group. diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 1ef2c47b8..5244961d9 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2679,9 +2679,8 @@ def _createVariable(self, **kwargs): "storage_options": g.get("storage_options"), "overwrite": g["overwrite"], } - print("zarr_kwargs = ", zarr_kwargs) + variable = g["dataset"].create_array(**zarr_kwargs) - print("___________") g["nc"][ncvar] = variable @@ -3385,6 +3384,8 @@ def _write_data( # ------------------------------------------------------------ # Still here? The write a normal (non-aggregation) variable # ------------------------------------------------------------ + zarr = g["backend"] == "zarr" + if compressed: # Write data in its compressed form data = data.source().source() @@ -3407,16 +3408,23 @@ def _write_data( meta=np.array((), dx.dtype), ) - # If a Zarr variable is sharded, then rechunk the Dask array - # to the shards, because "when writing data, a full shard must - # be written in one go for optimal performance and to avoid - # concurrency issues." - # (https://zarr.readthedocs.io/en/stable/user-guide/arrays.html). - if g["backend"] == "zarr": + # Initialise the file lock for the data writing from Dask + lock = None + + # Rechunk the Dask array to shards, if applicable. + if zarr: + # When a Zarr variable is sharded, the Dask array must be + # rechunked to the shards because "when writing data, a + # full shard must be written in one go for optimal + # performance and to avoid concurrency issues." + # https://zarr.readthedocs.io/en/stable/user-guide/arrays.html#sharding shards = g["nc"][ncvar].shards if shards is not None: - print(f"Zarr: rechunking to shards {shards} from {dx.chunks}") dx = dx.rechunk(shards) + # This rechunking has aligned Dask chunk boundaries + # with Zarr chunk boundaries, so we don't need to lock + # the write. + lock = False # Check for out-of-range values if g["warn_valid"]: @@ -3432,7 +3440,7 @@ def _write_data( meta=np.array((), dx.dtype), ) - if g["backend"] == "zarr": + if zarr: # `zarr` can't write a masked array to a variable, so we # have to replace missing data with the fill value. 
dx = dx.map_blocks( @@ -3441,11 +3449,10 @@ def _write_data( fill_value=g["nc"][ncvar].fill_value, ) - # try: - # except AttributeError: - # print ('chunks:', g["nc"][ncvar].chunking()) - - from ...data.locks import netcdf_lock as lock + if lock is None: + # We need to define the file lock for data writing from + # Dask + from ...data.locks import netcdf_lock as lock da.store( dx, g["nc"][ncvar], compute=True, return_stored=False, lock=lock diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 02e3c2ced..219243cb4 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -155,7 +155,7 @@ def test_zarr_read_write_CFA(self): self.assertTrue(z.equals(f)) self.assertTrue(z.equals(n)) - def test_zarr_groups(self): + def test_zarr_groups_1(self): """Test for the general handling of Zarr hierarchical groups.""" f = cfdm.example_field(1) @@ -273,7 +273,7 @@ def test_zarr_groups_compression(self): cfdm.write(f, grouped_dir, fmt="ZARR3") n = cfdm.read(grouped_file) - z = cfdm.read(grouped_dir, verbose=-1) + z = cfdm.read(grouped_dir) n = n[0] z = z[0] From 5ac1dde4b48abbcb8d7037ff20a65164567fa2ec Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 1 Sep 2025 18:58:31 +0100 Subject: [PATCH 17/39] dev --- cfdm/cfdmimplementation.py | 21 +++ cfdm/data/data.py | 4 +- cfdm/docstring/docstring.py | 49 ++++++- cfdm/mixin/netcdf.py | 164 +++++++++++----------- cfdm/read_write/netcdf/flatten/flatten.py | 2 + cfdm/read_write/netcdf/netcdfread.py | 109 +++++++++++--- cfdm/read_write/read.py | 6 + cfdm/read_write/write.py | 37 ++++- cfdm/test/test_Data.py | 2 +- cfdm/test/test_zarr.py | 4 +- 10 files changed, 286 insertions(+), 112 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 669e62266..19aa2011a 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1262,6 +1262,27 @@ def nc_set_hdf5_chunksizes(self, data, chunksizes): "Use 'nc_set_dataset_chunksizes' instead." ) + def nc_set_dataset_shards(self, data, shards): + """Set the dataset sharding strategy for the data. + + ..versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + data: `Data` + + shards: `None` or `int` or sewunce of `int` + Set the sharding strategy when writing to a Zarr + dataset. + + :Returns: + + `None` + + """ + print(shards) + return data.nc_set_dataset_shards(shards) + def parameters(self, parent): """Return all parameters from a component. diff --git a/cfdm/data/data.py b/cfdm/data/data.py index e0bb7721b..fed3b6e3c 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -28,7 +28,7 @@ ) from ..mixin.container import Container from ..mixin.files import Files -from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks, ZarrShards +from ..mixin.netcdf import NetCDFAggregation, NetCDFChunks, NetCDFShards from ..units import Units from .abstract import Array from .creation import to_dask @@ -56,7 +56,7 @@ class Data( - Container, NetCDFAggregation, NetCDFChunks, ZarrShards, Files, core.Data + Container, NetCDFAggregation, NetCDFChunks, NetCDFShards, Files, core.Data ): """An N-dimensional data array with units and masked values. diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 0ddcd32e3..da9567342 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -542,7 +542,6 @@ None}``.""", # read store_dataset_chunks "{{read store_dataset_chunks: `bool`, optional}}": """store_dataset_chunks: `bool`, optional - If True (the default) then store the dataset chunking strategy for each returned data array. 
The dataset chunking strategy is then accessible via an object's
@@ -555,13 +554,30 @@
             chunking (such as a netCDF-3 dataset), then no dataset
             chunking strategy is stored (i.e. an
             `nc_dataset_chunksizes` method will return `None` for all
-            `Data` objects). In this case, when the data is written to
-            a new netCDF file, the dataset chunking strategy will be
-            determined by `{{package}}.write`.
+            returned `Data` objects). In this case, when the data is
+            written to a new netCDF file, the dataset chunking
+            strategy will be determined by `{{package}}.write`.
 
             See the `{{package}}.write` *dataset_chunks* parameter for
             details on how the dataset chunking strategy is determined
             at the time of writing.""",
+    # read store_dataset_shards
+    "{{read store_dataset_shards: `bool`, optional}}": """store_dataset_shards: `bool`, optional
+            If True (the default) then store the dataset sharding
+            strategy for each returned data array. The dataset
+            sharding strategy is then accessible via an object's
+            `nc_dataset_shards` method. When the dataset sharding
+            strategy is stored, it will be used when the data is
+            written to a new Zarr dataset with `{{package}}.write`
+            (unless the strategy is modified prior to writing).
+
+            If False, or if the dataset being read does not support
+            sharding (such as a netCDF-4 dataset), then no dataset
+            sharding strategy is stored (i.e. an `nc_dataset_shards`
+            method will return `None` for all returned `Data`
+            objects). In this case, when the data is written to a new
+            Zarr dataset, the dataset sharding strategy will be
+            determined by `{{package}}.write`.""",
     # read cfa
     "{{read cfa: `dict`, optional}}": """cfa: `dict`, optional
             Configure the reading of CF-netCDF aggregation files.
@@ -1319,6 +1335,17 @@
             names, are normalised to absolute paths prior to the
             replacement. If False (the default) then no normalisation
             is done.""",
+    # sharding
+    "{{sharding description}}": """When writing to a Zarr dataset, sharding provides a mechanism
+        to store multiple chunks in a single storage object or
+        file. This can be useful because traditional file systems and
+        object storage systems may have performance issues storing and
+        accessing many files. Additionally, small files can be
+        inefficient to store if they are smaller than the block size
+        of the file system.
+
+        The sharding strategy is ignored when writing to a non-Zarr
+        dataset.""",
     # ----------------------------------------------------------------
     # Method description substitutions (4 levels of indentation)
     # ----------------------------------------------------------------
@@ -1346,4 +1373,18 @@
     "{{Returns original filenames}}": """The original file names in normalised absolute form.
                 If there are no original files then an empty `set` will be
                 returned.""",
+    # sharding options
+    "{{sharding options}}": """* `None`
+
+                    No sharding.
+
+                    * `int`
+
+                    The integer number of chunks to be stored in a
+                    single shard, favouring an equal number of chunks
+                    along each shard dimension.
+
+                    * sequence of `int`
+
+                    The number of chunks along each shard dimension.""",
 }
diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py
index 0d6c85f86..11b53810f 100644
--- a/cfdm/mixin/netcdf.py
+++ b/cfdm/mixin/netcdf.py
@@ -2545,7 +2545,7 @@ def nc_dataset_chunksizes(self, todict=False):
 
         .. seealso:: `nc_clear_dataset_chunksizes`,
                      `nc_set_dataset_chunksizes`, `{{package}}.read`,
-                     `{{package}}.write`
+                     `nc_dataset_shards`, `{{package}}.write`
 
         :Parameters:
 
@@ -2616,7 +2616,7 @@ def nc_clear_dataset_chunksizes(self):
 
         .. 
seealso:: `nc_dataset_chunksizes`, `nc_set_dataset_chunksizes`, `{{package}}.read`, - `{{package}}.write` + `nc_dataset_shards`, `{{package}}.write` :Returns: @@ -2666,7 +2666,8 @@ def nc_set_dataset_chunksizes(self, chunksizes): .. seealso:: `nc_dataset_chunksizes`, `nc_clear_dataset_chunksizes`, - `{{package}}.read`, `{{package}}.write` + `nc_dataset_shards`, `{{package}}.read`, + `{{package}}.write` :Parameters: @@ -5131,51 +5132,57 @@ def nc_set_aggregation_write_status(self, status): self._nc_set_aggregation_write_status(status) -class ZarrShards(NetCDFMixin): +class NetCDFShards(NetCDFMixin): """Mixin class for accessing dataset shard size. + When writing to a Zarr dataset, sharding provides a mechanism to + store multiple chunks in a single storage object or file. This can + be useful because traditional file systems and object storage + systems may have performance issues storing and accessing many + files. Additionally, small files can be inefficient to store if + they are smaller than the block size of the file system. + + The sharding strategy is ignored when writing to a non-Zarr + datset. + .. versionadded:: (cfdm) NEXTVERSION """ - def nc_dataset_shards(self, todict=False): + def nc_dataset_shards(self): """Get the dataset shard size for the data. + {{sharding description}} + .. versionadded:: (cfdm) NEXTVERSION .. seealso:: `nc_clear_dataset_shards`, - `nc_set_dataset_shards`, `{{package}}.write` - - :Parameters: - - {{chunk todict: `bool`, optional}} + `nc_set_dataset_shards`, `nc_dataset_chunksizes`, + `{{package}}.write` :Returns: - {{Returns nc_dataset_chunksizes}} + `None` or `int` or sequence of `int` + The current sharding strateg. One of: + + {{sharding options}} **Examples** >>> d.shape - (1, 96, 73) - >>> d.nc_set_dataset_chunksizes([1, 35, 73]) - >>> d.nc_dataset_chunksizes() - (1, 35, 73) - >>> d.nc_dataset_chunksizes(todict=True) - {0: 1, 1: 35, 2: 73} - >>> d.nc_clear_dataset_chunksizes() - (1, 35, 73) - >>> d.nc_dataset_chunksizes() - None - >>> d.nc_set_dataset_chunksizes('contiguous') - >>> d.nc_dataset_chunksizes() - 'contiguous' - >>> d.nc_set_dataset_chunksizes('1 KiB') - >>> d.nc_dataset_chunksizes() - 1024 - >>> d.nc_set_dataset_chunksizes(None) + (1, 100, 200) >>> d.nc_dataset_chunksizes() + (1, 30, 50) + >>> d.nc_set_dataset_shards(4) + >>> d.nc_dataset_shards() + 4 + >>> d.nc_clear_dataset_shards() + 4 + >>> print(d.nc_dataset_shards()) None + >>> d.nc_set_dataset_shards((5, 4)) + >>> d.nc_dataset_shards() + (5, 4) """ return self._get_netcdf().get("dataset_shards") @@ -5183,30 +5190,36 @@ def nc_dataset_shards(self, todict=False): def nc_clear_dataset_shards(self): """Clear the dataset shard size for the data. + {{sharding description}} + .. versionadded:: (cfdm) NEXTVERSION .. seealso:: `nc_dataset_shards`, `nc_set_dataset_shards`, - `{{package}}.write` + `nc_dataset_chunksizes`, `{{package}}.write` :Returns: - `None` or `str` or `int` or `tuple` of `int` - The chunking strategy prior to being cleared, as would - be returned by `nc_dataset_chunksizes`. + `None` or `int` or sequence of `int` + The cleared sharding strategy. 
One of: + + {{sharding options}} **Examples** >>> d.shape - (1, 96, 73) - >>> d.nc_set_dataset_chunksizes([1, 35, 73]) - >>> d.nc_clear_dataset_chunksizes() - (1, 35, 73) - >>> d.nc_set_dataset_chunksizes('1 KiB') - >>> d.nc_clear_dataset_chunksizes() - 1024 - >>> d.nc_set_dataset_chunksizes(None) - >>> print(d.nc_clear_dataset_chunksizes()) + (1, 100, 200) + >>> d.nc_dataset_chunksizes() + (1, 30, 50) + >>> d.nc_set_dataset_shards(4) + >>> d.nc_dataset_shards() + 4 + >>> d.nc_clear_dataset_shards() + 4 + >>> print(d.nc_dataset_shards()) None + >>> d.nc_set_dataset_shards((5, 4)) + >>> d.nc_dataset_shards() + (5, 4) """ return self._get_netcdf().pop("dataset_shards", None) @@ -5214,22 +5227,23 @@ def nc_clear_dataset_shards(self): def nc_set_dataset_shards(self, shards): """Set the dataset sharding strategy for the data. - The sharding strategy is either the integer number of chunks - stored in a single storage object (e.g. a file), or else - `None` to indicate that there is no sharding (i.e. each chunk - is stored in a different storage object). + {{sharding description}} .. versionadded:: (cfdm) NEXTVERSION .. seealso:: `nc_dataset_shards`, `nc_clear_dataset_shards`, - `{{package}}.write` + `nc_dataset_chunksizes`, `{{package}}.write` :Parameters: - {{chunk chunksizes}} + shards: `None` or `int` or sequence of `int` + The new sharding strategy. One of: - Each dictionary key is an integer that specifies an - axis by its position in the data array. + {{sharding options}} + + *Example:* + For two dimensional data, the following are + equivalent: ``25`` and ``(5, 5)``. :Returns: @@ -5238,34 +5252,22 @@ def nc_set_dataset_shards(self, shards): **Examples** >>> d.shape - (1, 96, 73) - >>> d.nc_set_dataset_chunksizes([1, 35, 73]) - >>> d.nc_dataset_chunksizes() - (1, 35, 73) - >>> d.nc_clear_dataset_chunksizes() - (1, 35, 73) + (1, 100, 200) >>> d.nc_dataset_chunksizes() + (1, 30, 50) + >>> d.nc_set_dataset_shards(4) + >>> d.nc_dataset_shards() + 4 + >>> d.nc_clear_dataset_shards() + 4 + >>> print(d.nc_dataset_shards()) None - >>> d.nc_set_dataset_chunksizes('contiguous') - >>> d.nc_dataset_chunksizes() - 'contiguous' - >>> d.nc_set_dataset_chunksizes('1 KiB') - >>> d.nc_dataset_chunksizes() - 1024 - >>> d.nc_set_dataset_chunksizes(None) - >>> d.nc_dataset_chunksizes() + >>> d.nc_set_dataset_shards((5, 4)) + >>> d.nc_dataset_shards() + (5, 4) + >>> d.nc_set_dataset_shards(None) + >>> print(d.nc_dataset_shards()) None - >>> d.nc_set_dataset_chunksizes([9999, -1, None]) - >>> d.nc_dataset_chunksizes() - (1, 96, 73) - >>> d.nc_clear_dataset_chunksizes() - (1, 96, 73) - >>> d.nc_set_dataset_chunksizes({1: 24}) - >>> d.nc_dataset_chunksizes() - (1, 24, 73) - >>> d.nc_set_dataset_chunksizes({0: None, 2: 50}) - >>> d.nc_dataset_chunksizes() - (1, 24, 50) """ if shards is None: @@ -5276,7 +5278,7 @@ def nc_set_dataset_shards(self, shards): if shards < 1: raise ValueError( f"'shards' must be None, a positive integer, or a " - f"sequence positive of integers. Got {shards!r}" + f"sequence positive of integers. Got: {shards!r}" ) self._set_netcdf("dataset_shards", shards) @@ -5287,21 +5289,21 @@ def nc_set_dataset_shards(self, shards): except TypeError: raise ValueError( f"'shards' must be None, a positive integer, or a " - f"sequence positive of integers. Got {shards!r}" + f"sequence positive of integers. 
Got: {shards!r}" ) - shape = self.shape - if len(shards) != len(shape): + if len(shards) != len(self.shape): raise ValueError( - f"When shards is a sequence {shards!r} then it must have the " - f"same length as the number of data dimensions ({len(shape)})" + f"When shards is a sequence it must have the same length as " + f"the number of data dimensions ({len(self.shape)}): " + f"Got: {shards!r} " ) for n, i in enumerate(shards): if not (isinstance(i, Integral) and i > 0): raise ValueError( f"Shard size for dimension position {n} must be " - f"a positive integer. Got {i!r}" + f"a positive integer. Got: {i!r}" ) self._set_netcdf("dataset_shards", shards) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 19868abc6..b319ab558 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -41,6 +41,8 @@ None: "native", } +# TODOZARR = _ARRAY_DIMENSIONS, parse and remove ! + # Set of netCDF attributes that contain references to dimensions or # variables referencing_attributes = set(flattening_rules) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index b3aec9bb3..249c9db76 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -472,17 +472,18 @@ def dataset_close(self): for flat_dataset in g["flat_datasets"]: flat_dataset.close() - if g["original_dataset_opened_with"] == "zarr": - # zarr: No need to close - return - - # netCDF4, h5netcdf for nc in g["datasets"]: - nc.close() + try: + nc.close() + except AttributeError: + pass # Close the original grouped file (v1.8.8.1) if "nc_grouped" in g: - g["nc_grouped"].close() + try: + g["nc_grouped"].close() + except AttributeError: + pass # Close s3fs.File objects for f in g["s3fs_File_objects"]: @@ -933,6 +934,7 @@ def read( cache=True, dask_chunks="storage-aligned", store_dataset_chunks=True, + store_dataset_shards=True, cfa=None, cfa_write=None, to_memory=None, @@ -1019,6 +1021,12 @@ def read( .. versionadded:: (cfdm) 1.12.0.0 + store_dataset_shards: `bool`, optional + Store the dataset sharding strategy. See `cfdm.read` + for details. + + .. versionadded:: (cfdm) NEXTVERSION + cfa: `dict`, optional Configure the reading of CF-netCDF aggregation datasets. See `cfdm.read` for details. @@ -1433,9 +1441,11 @@ def read( # Dask chunking of aggregated data for selected constructs "cfa_write": cfa_write, # -------------------------------------------------------- - # Whether or not to store the dataset chunking strategy + # Whether or not to store the dataset chunking and + # sharding strategy # -------------------------------------------------------- "store_dataset_chunks": bool(store_dataset_chunks), + "store_dataset_shards": bool(store_dataset_shards), # -------------------------------------------------------- # Constructs to read into memory # -------------------------------------------------------- @@ -8244,15 +8254,26 @@ def _create_Data( **kwargs, ) - # Store the dataset chunking - if self.read_vars["store_dataset_chunks"] and ncvar is not None: - # Only store the dataset chunking if 'data' has the same - # shape as its netCDF variable. This may not be the case - # for variables compressed by convention (e.g. some DSG - # variables). 
- chunks, shape = self._get_dataset_chunks(ncvar) - if shape == data.shape: - self.implementation.nc_set_dataset_chunksizes(data, chunks) + if ncvar is not None: + # Store the dataset chunking + if self.read_vars["store_dataset_chunks"]: + # Only store the dataset chunking if 'data' has the + # same shape as its netCDF variable. This may not be + # the case for variables compressed by convention + # (e.g. some DSG variables). + chunks, shape = self._get_dataset_chunks(ncvar) + if shape == data.shape: + self.implementation.nc_set_dataset_chunksizes(data, chunks) + + # Store the dataset sharding + if self.read_vars["store_dataset_shards"]: + # Only store the dataset sharding if 'data' has the + # same shape as its netCDF variable. This may not be + # the case for variables compressed by convention + # (e.g. some DSG variables). + shards, shape = self._get_dataset_shards(ncvar) + if shards is not None and shape == data.shape: + self.implementation.nc_set_dataset_shards(data, shards) return data @@ -10892,7 +10913,7 @@ def _dataset_has_groups(self, nc): return bool(nc.groups) case "zarr": - return bool(tuple(nc.groups())) + return bool(tuple(nc.group_keys())) # if self.read_vars["dataset_opened_with"] == "zarr": # return bool(tuple(nc.groups())) @@ -11215,9 +11236,10 @@ def _file_variable_attributes(self, var): case "zarr": attrs = dict(var.attrs) - if self.read_vars["original_dataset_opened_with"] == "zarr": - # zarr: Remove the _ARRAY_DIMENSIONS attribute - attrs.pop("_ARRAY_DIMENSIONS", None) # TODOZARR + + # Remove the _ARRAY_DIMENSIONS from Zarr v2 attributes + if var.info._zarr_format == 2: + attrs.pop("_ARRAY_DIMENSIONS", None) return attrs @@ -12132,3 +12154,48 @@ def _set_quantization(self, parent, ncvar): # Set the Quantization metadata self.implementation.set_quantization(parent, q, copy=False) + + def _get_dataset_shards(self, ncvar): + """Return a netCDF variable's dataset storage shards. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + ncvar: `str` + The netCDF variable name. + + :Returns: + + 2-tuple: + The variable's sharding strategy and its shape. If the + dataset is not Zarr, then (`None`, `None`) is + returned. + + + **Examples** + + >>> n._get_dataset_shards('tas') + [1, 2, 3], (12, 324, 432) + >>> n._get_dataset_chunks('pr') + None, (12, 324, 432) + + """ + g = self.read_vars + if g["original_dataset_opened_with"] != "zarr": + # Only Zarr datasets have shards + return None, None + + if g["has_groups"]: + nc = g["nc_grouped"] + else: + nc = g["nc"] + + var = nc[ncvar] + shards = var.shards + if shards is not None: + # Re-cast 'shards' as the number of chunks (as opposed to + # data elemnents) along each of its dimensions + shards = [s // c for s, c in zip(shards, var.chunks)] + + return shards, var.shape diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index de08c30aa..669fdd6bf 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -170,6 +170,10 @@ class read(ReadWrite): .. versionadded:: (cfdm) 1.11.2.0 + {{read store_dataset_shards: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + {{read cfa: `dict`, optional}} .. 
versionadded:: (cfdm) 1.12.0.0
 
@@ -238,6 +242,7 @@ def __new__(
         cache=True,
         dask_chunks="storage-aligned",
         store_dataset_chunks=True,
+        store_dataset_shards=True,
         cfa=None,
         cfa_write=None,
         to_memory=False,
@@ -564,6 +569,7 @@ def _read(self, dataset):
             "cache",
             "dask_chunks",
             "store_dataset_chunks",
+            "store_dataset_shards",
             "cfa",
             "cfa_write",
             "to_memory",
diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py
index a9a5e6eaf..f6f354908 100644
--- a/cfdm/read_write/write.py
+++ b/cfdm/read_write/write.py
@@ -612,8 +612,41 @@ class write(ReadWrite):
 
         .. versionadded:: (cfdm) 1.12.0.0
 
-        dataset_shards: `str` or `int` or `float`, optional
-            TODOZARR
+        dataset_shards: `None` or `int`, optional
+            When writing to a Zarr dataset, sharding provides a
+            mechanism to store multiple chunks in a single storage
+            object or file. This can be useful because traditional
+            file systems and object storage systems may have
+            performance issues storing and accessing many
+            files. Additionally, small files can be inefficient to
+            store if they are smaller than the block size of the file
+            system.
+
+            The *dataset_shards* parameter is ignored when writing to
+            a non-Zarr dataset.
+
+            If any `Data` being written already stores its own dataset
+            sharding strategy (i.e. its `Data.nc_dataset_shards`
+            method returns something other than `None`) then, for that
+            data array alone, it is used in preference to the strategy
+            defined by the *dataset_shards* parameter.
+
+            The *dataset_shards* parameter may be one of:
+
+            * `None`
+
+              No sharding.
+
+            * `int`
+
+              The integer number of chunks to be stored in a single
+              shard, favouring an equal number of chunks along each
+              shard dimension.
+
+              *Example:*
+                For two-dimensional `Data`, ``dataset_shards=9`` will
+                result in shards that span 3 chunks along each
+                dimension. 
cfa: `str` or `dict` or `None`, optional Specify which netCDF variables, if any, should be written diff --git a/cfdm/test/test_Data.py b/cfdm/test/test_Data.py index 2749be5d7..0dd45ea01 100644 --- a/cfdm/test/test_Data.py +++ b/cfdm/test/test_Data.py @@ -2864,7 +2864,7 @@ def test_Data_dataset_shards(self): self.assertIsNone(d.nc_dataset_shards()) self.assertIsNone(d.nc_set_dataset_shards(None)) - self.assertIsNone(d.nc_dataset_shards(None)) + self.assertIsNone(d.nc_dataset_shards()) self.assertIsNone(d.nc_set_dataset_shards(100)) self.assertEqual(d.nc_dataset_shards(), 100) diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 219243cb4..082c94af3 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -116,8 +116,9 @@ def test_zarr_read_write_chunks_shards(self): # Make shards comprising 4 chunks cfdm.write(f, tmpdir1, fmt="ZARR3", dataset_shards=4) - z = cfdm.read(tmpdir1)[0] + z = cfdm.read(tmpdir1, store_dataset_shards=False)[0] self.assertTrue(z.equals(f)) + self.assertIsNone(z.data.nc_dataset_shards()) z = zarr.open(tmpdir1) self.assertEqual(z["q"].chunks, (2, 3)) @@ -128,6 +129,7 @@ def test_zarr_read_write_chunks_shards(self): cfdm.write(f, tmpdir1, fmt="ZARR3") z = cfdm.read(tmpdir1)[0] self.assertTrue(z.equals(f)) + self.assertEqual(z.data.nc_dataset_shards(), (2, 2)) z = zarr.open(tmpdir1) self.assertEqual(z["q"].chunks, (2, 3)) From 2158977bf74f4b3284d868b04a87c1255576c17d Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 2 Sep 2025 19:48:36 +0100 Subject: [PATCH 18/39] dev --- cfdm/cfdmimplementation.py | 1 - cfdm/read_write/netcdf/flatten/flatten.py | 568 +++++++++++++--------- cfdm/read_write/netcdf/netcdfread.py | 17 +- cfdm/read_write/netcdf/netcdfwrite.py | 411 ++++++++-------- cfdm/read_write/netcdf/zarr.py | 48 +- 5 files changed, 577 insertions(+), 468 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 19aa2011a..214451d85 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1280,7 +1280,6 @@ def nc_set_dataset_shards(self, data, shards): `None` """ - print(shards) return data.nc_set_dataset_shards(shards) def parameters(self, parent): diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index b319ab558..bdae1b905 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -41,8 +41,6 @@ None: "native", } -# TODOZARR = _ARRAY_DIMENSIONS, parse and remove ! - # Set of netCDF attributes that contain references to dimensions or # variables referencing_attributes = set(flattening_rules) @@ -54,8 +52,9 @@ def netcdf_flatten( strict=True, omit_data=False, write_chunksize=134217728, + dimension_mode="ancestor", ): - """Create a flattened version of a grouped dataset. + """Create a flattened version of a grouped CF dataset. **CF-netCDF coordinate variables** @@ -93,7 +92,8 @@ def netcdf_flatten( else a `zarr.Group` object. output_ds: `netCDF4.Dataset` - A container for the flattened dataset. + A container for the flattened dataset that will get + updated in-place with the flattened input dataset. strict: `bool`, optional If True, the default, then failing to resolve a reference @@ -119,6 +119,44 @@ def netcdf_flatten( *input_ds* to *output_ds* for each piece. Ignored if *omit_data* is True. + dimension_mode: `str`, optional + How to interpret a dimension name that has no path, + i.e. 
one that contains no group-separator characters, such
+            as ``dim``, as opposed to ``group/dim`` or ``/group/dim``,
+            etc.
+
+            This is only required for Zarr input datasets, for which
+            there is no means of indicating whether dimension names
+            that appear in different groups correspond to each other,
+            or not.
+
+            For non-Zarr datasets that adhere to the netCDF data
+            model, *dimension_mode* is ignored because any
+            correspondence between dimensions is already explicitly
+            recorded in these datasets.
+
+            The *dimension_mode* parameter must be one of:
+
+            * ``'ancestor'``
+
+              This is the default. Assume that the dimension is the
+              same as one with the same name and size defined in an
+              ancestor group, if one exists. If multiple such
+              dimensions exist, then the correspondence is with the
+              dimension in the ancestor group that is furthest away
+              from the root group.
+
+            * ``'local'``
+
+              Assume that the dimension is different to any with the
+              same name defined in ancestor groups.
+
+            .. versionadded:: (cfdm) NEXTVERSION
+
+    :Returns:
+
+        `None`
+
     """
     _Flattener(
         input_ds,
@@ -126,6 +164,7 @@
         strict,
         omit_data=omit_data,
         write_chunksize=write_chunksize,
+        dimension_mode=dimension_mode,
     ).flatten()
@@ -243,6 +282,8 @@ class _Flattener:
     Contains the input file, the output file being flattened, and all
     the logic of the flattening process.
 
+    See `netcdf_flatten` for details.
+
     .. versionadded:: (cfdm) 1.11.2.0
 
     """
@@ -254,6 +295,7 @@ def __init__(
         strict=True,
         omit_data=False,
         write_chunksize=134217728,
+        dimension_mode="ancestor",
     ):
         """**Initialisation**
 
@@ -276,6 +318,11 @@ def __init__(
             write_chunksize: `int`, optional
                 See `netcdf_flatten`.
 
+            dimension_mode: `str`, optional
+                See `netcdf_flatten`.
+
+                .. 
versionadded:: (cfdm) 1.11.2.0 :Parameters: - x: variable, group, or dataset + x: + The variable, group, or dataset object :Returns: @@ -688,11 +761,23 @@ def ncattrs(self, x): """ match self._backend(): - case "h5netcdf" | "zarr": - return list(x.attrs) + case "h5netcdf": + attrs = list(x.attrs) case "netCDF4": - return x.ncattrs() + attrs = x.ncattrs() + + case "zarr": + attrs = dict(x.attrs) + + # Remove _ARRAY_DIMENSIONS from Zarr v2 variable + # attributes + if x.metadata.zarr_format == 2 and hasattr(x, "shape"): + attrs.pop("_ARRAY_DIMENSIONS", None) + + attrs = list(attrs) + + return attrs def parent(self, group): """Return the parent group. @@ -759,7 +844,7 @@ def flatten(self): if self._debug: logger.debug( - f"Flattening the groups of {self.filepath(input_ds)}" + f"Flattening the groups of {self.dataset_name()}" ) # pragma: no cover # Flatten product @@ -800,23 +885,15 @@ def process_group(self, input_group): f" Browsing group {self.path(input_group)}" ) # pragma: no cover - for attr_name in self.ncattrs(input_group): + for attr_name in self._attribute_names(input_group): self.flatten_attribute(input_group, attr_name) - # for dim in input_group.dimensions.values(): - # self.flatten_dimension(dim) - - for dim in self._dimensions(input_group).values(): + for dim in self._group_dimensions(input_group).values(): self.flatten_dimension(dim) - # for var in input_group.variables.values(): - # self.flatten_variable(var) - for var in self._variables(input_group).values(): + for var in self._group_variables(input_group).values(): self.flatten_variable(var) - # for child_group in input_group.groups.values(): - # self.process_group(child_group) - for child_group in self._child_groups(input_group).values(): self.process_group(child_group) @@ -872,11 +949,6 @@ def flatten_dimension(self, dim): `None` """ - # logger.debug( - # f" Creating dimension {self.name(dim)!r} from " - # f"group {self.path(self.group(dim))} to root" - # ) - # Create new name group = self.group(dim) name = self.name(dim) @@ -916,34 +988,29 @@ def flatten_variable(self, var): `None` """ - # logger.debug( - # f" Copying variable {self.name(var)!r} from " - # f"group {self.path(self.group(var))} to root" - # ) - # Create new name new_name = self.generate_flattened_name( self.group(var), self.name(var) ) + if self._debug: + logger.debug( + f" Creating variable {new_name!r} from " + f"{self.pathname(self.group(var), self.name(var))!r}" + ) # pragma: no cover + # Replace old by new dimension names new_dims = list( map( lambda x: self._dim_map[ self.pathname(self.group(x), self.name(x)) ], - self.get_dims(var), + self._variable_dimensions(var), ) ) # Write variable - if self._debug: - logger.debug( - f" Creating variable {new_name!r} from " - f"{self.pathname(self.group(var), self.name(var))!r}" - ) # pragma: no cover - - attributes = self.attrs(var) + attributes = self._variable_attrs(var) omit_data = self._omit_data if omit_data: @@ -951,7 +1018,6 @@ def flatten_variable(self, var): else: fill_value = attributes.pop("_FillValue", None) - # print ( new_name, self.get_dims(var), new_dims, self.chunksizes(var)) new_var = self._output_ds.createVariable( new_name, self.dtype(var), @@ -1327,7 +1393,7 @@ def resolve_reference_post_processing( and rules.limit_to_scalar_coordinates and ( ( - "coordinates" not in self.ncattrs(orig_var) + "coordinates" not in self._attribute_names(orig_var) or orig_ref not in self.getncattr(orig_var, "coordinates") ) or self._input_ds[absolute_ref].ndim > 0 @@ -1389,10 +1455,10 @@ def 
search_by_relative_path(self, ref, current_group, search_dim): # Get variable or dimension if search_dim: - elt = tuple(self._dimensions(current_group))[ref_split[-1]] + elt = tuple(self._group_dimensions(current_group))[ref_split[-1]] else: - elt = tuple(self._variables(current_group))[ref_split[-1]] + elt = tuple(self._group_variables(current_group))[ref_split[-1]] # Get absolute reference return self.pathname(self.group(elt), self.name(elt)) @@ -1442,16 +1508,16 @@ def search_by_proximity( """ if search_dim: - dims_or_vars = self._dimensions(current_group) + dims_or_vars = self._group_dimensions(current_group) else: - dims_or_vars = self._variables(current_group) + dims_or_vars = self._group_variables(current_group) # Found in current group if ref in dims_or_vars: return dims_or_vars[ref] - local_apex_reached = local_apex_reached or ref in self._dimensions( - current_group + local_apex_reached = ( + local_apex_reached or ref in self._group_dimensions(current_group) ) # Check if have to continue looking in parent group @@ -1515,7 +1581,7 @@ def resolve_references(self, var, old_var): `None` """ - var_attrs = self.attrs(var, "output") + var_attrs = self._variable_attrs(var, "output") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value parsed_attribute = parse_attribute(name, var_attrs[name]) @@ -1562,7 +1628,7 @@ def adapt_references(self, var): `None` """ - var_attrs = self.attrs(var, "output") + var_attrs = self._variable_attrs(var, "output") for name in referencing_attributes.intersection(var_attrs): # Parse attribute value value = var_attrs[name] @@ -1826,8 +1892,8 @@ def handle_reference_error(self, role, ref, context=None): warnings.warn(message) return f"{ref_not_found_error}_{ref}" - def _dimensions(self, group): - """Return dimensions that are defined in this group. + def _group_dimensions(self, group): + """Return dimensions that are defined in a group. .. versionadded:: (cfdm) NEXTVERSION @@ -1849,16 +1915,15 @@ def _dimensions(self, group): case "zarr": group_name = self.path(group) - if group_name not in self._group_to_dims: + if not self._group_to_dims and group_name == group_separator: # Populate the `_group_to_dims` and `_var_to_dims` - # dictionaries (we only need do this once per call - # of `flatten`) + # dictionaries if we're at the root group self._populate_dimension_maps(group) return self._group_to_dims[group_name] - def _variables(self, group): - """Return variables that are defined in this group. + def _group_variables(self, group): + """Return variables that are defined in a group. .. versionadded:: (cfdm) NEXTVERSION @@ -1899,92 +1964,32 @@ def _populate_dimension_maps(self, group): full-path variables names to their dimensions will be added to `_var_to_dims`. For instance:: - {'/latitude_longitude': [], - '/x': [], - '/x_bnds': [ - ], - '/forecast/cell_measure': [, - ], - '/forecast/latitude': [, - ], - '/forecast/longitude': [, - ], - '/forecast/rotated_latitude_longitude': [], - '/forecast/time': [], - '/forecast/y': [], - '/forecast/y_bnds': [, - ], - '/forecast/model/ta': [, - ]} - - **Zarr** - - Populating the `_group_to_dims` dictionary is only - required for a Zarr grouped dataset, for which this - information is not explicitly defined in the format's data - model (unlike for netCDF-3 and netCDF-4 datasets). - - To create the mapping, we need to impose an understanding of - CF dimensions and groups onto the contents of the Zarr - dataset. 
In CF, every dimension is explicitly declared in a - unique group, and that dimension may be referenced from - another group by applying one of the standardised CF search - algorithms (search by absolute path, search by relative path, - and search by proximity). - - In Zarr v3, a variable may name its dimensions, but there is - no "dimension class" in the data model that allows variables - and attributes to reference an explicit dimension defined in a - specific group. However, such a dimension class is required - when viewing the Zarr dataset with the netCDF data model, and - so a virtual "ZarrDimension" object that defines a dimension - in a unique group needs to be inferred. - - For dimension names that are absolute paths - (e.g. '/group1/dim') or relative paths that include a group - reference (e.g. 'group1/dim' or '../group1/dim') the group in - which to create the dimension is unambiguously defined. When a - dimension name has no path (e.g. 'dim') it may refer to a - logical dimension with that name in an ancestor group. In all - cases we have to consider the possibility that the size of - variable's dimension may not match the size of the - corresponding dimension ZarrDimension object. In this case, - depending on the nature of the Zarr dimension names, we must - either fail or else define a new ZarrDimension in a different - group. - - In this example, we must fail because only one ZarrDimension - object is implied ('dim' in group '/group1'), but the two - variables which require that dimension have different sizes (3 - and 999):: - - / - ├── variable1 (3,) (group/dim,) - ├── group1 - │ ├── variable2 (999,) (dim,) - - In this example we can reasonably create two ZarrDimension - objects: 'dim' in group '/' with size 3, and 'dim' in group - '/group1' with size 999:: - - / - ├── variable1 (3,) (dim,) - ├── group1 - │ ├── variable2 (999,) (dim,) - - In this example, we have no way of knowing if the 'dim' of - 'variable2' is the same logical dimension as the 'dim' of - 'variable1', or not:: - - / - ├── variable1 (3,) (dim,) - ├── group1 - │ ├── variable2 (3,) (dim,) - - Both options can be made to work, but this code always assumes - the former case, i.e. 'dim' of 'variable2' is the same logical - dimension as the 'dim' of 'variable1', and so only one - ZarrDimension object is created: 'dim' in group '/'. + {'/latitude_longitude': (), + '/x': (,), + '/x_bnds': ( + ), + '/forecast/cell_measure': (, + ), + '/forecast/latitude': (, + ), + '/forecast/longitude': (, + ), + '/forecast/rotated_latitude_longitude': (), + '/forecast/time': (), + '/forecast/y': (,), + '/forecast/y_bnds': (, + ), + '/forecast/model/ta': (, + )} + + **Zarr datasets** + + Populating the `_group_to_dims` dictionary is only required + for a Zarr grouped dataset, for which this information is not + explicitly defined in the format's data model (unlike for + netCDF-3 and netCDF-4 datasets). + + See `netcdf_flatten` for details .. versionadded:: (cfdm) NEXTVERSION @@ -2003,22 +2008,26 @@ def _populate_dimension_maps(self, group): group_name = self.path(group) input_ds = self._input_ds + group_to_dims = self._group_to_dims var_to_dims = self._var_to_dims - group_to_dims = self._group_to_dims + dimension_mode = self._dimension_mode + + # Initialise mapping from the group to its ZarrDimension + # objects. Use 'setdefault' because a previous call to + # `_populate_dimension_maps` might already have done this. group_to_dims.setdefault(group_name, {}) # Loop over variables in this group, sorted by variable name. 
for v in dict(sorted(group.arrays())).values(): - dimension_names = v.metadata.dimension_names - if dimension_names is None: - if v.shape: - raise DimensionParsingException( - "Non-scalar variable has no dimension names: {v.name}" - ) - - # Scalar variable has no dimensions - var_to_dims[v.name] = [] + # Initialise mapping from the variable to its + # ZarrDimension objects + var_name = v.name + var_to_dims[var_name] = () + + dimension_names = self._variable_dimension_names(v) + if not dimension_names: + # A scalar variable has no dimensions continue # Loop over this variable's dimension names @@ -2029,7 +2038,10 @@ def _populate_dimension_maps(self, group): # ---------------------------------------------------- # Define 'g' as the absolute path name of the group in # which to register the logical dimension object for - # this dimension 'name' + # this dimension. + # + # Which group is defined will depend on the nature of + # the dimension's 'name'. # ---------------------------------------------------- if group_separator not in name: # ------------------------------------------------ @@ -2037,59 +2049,88 @@ def _populate_dimension_maps(self, group): # '/' characters. # # E.g. "dim" - # - # Search by proximity for a dimension that already - # exists. # ------------------------------------------------ - found_dim_in_parent = False - group_split = group_name.split(group_separator) - for n in range(len(group_split) - 1, 0, -1): - g = group_separator.join(group_split[:n]) - if g == "": - g = group_separator - - zarr_dim = group_to_dims[g].get(basename) - if zarr_dim is not None and zarr_dim.size == size: - # Found a dimension in this parent group - # with the right name and size - found_dim_in_parent = True - break - - if not found_dim_in_parent: - # Dimension 'basename' could not be matched to - # any parent group dimensions, so it needs to - # be defined in the current group. + if dimension_mode == "ancestor": + # Assume that the dimension is the same as one + # with the same name and size defined in an + # ancestor group, if one exists. If multiple + # such dimensions exist, then the + # correspondence is with the dimension in the + # ancestor group that is furthest way from the + # root group. + # + # E.g. if the current group is /g1/g2/g3 then + # search groups /g1/g2, /g1, and / in that + # order, stopping if a match is found. If no + # match is found then we define the dimension + # in the current group. + found_dim_in_ancestor = False + group_split = group_name.split(group_separator) + for n in range(len(group_split) - 1, 0, -1): + g = group_separator.join(group_split[:n]) + if g == "": + g = group_separator + + zarr_dim = group_to_dims[g].get(basename) + if zarr_dim is not None and zarr_dim.size == size: + # Found a dimension in this parent + # group with the right name and size + found_dim_in_ancestor = True + break + + if not found_dim_in_ancestor: + # Dimension 'basename' could not be + # matched to any parent group dimensions, + # so define it in the current group. + g = group_name + + elif dimension_mode == "local": + # Assume that the dimension is different to + # any with same name defined in an ancestor + # group. g = group_name - + else: + raise DimensionParsingException( + "Bad value of 'dimension_mode': {dimension_mode!r}" + ) else: g = group_separator.join(name_split[:-1]) if name.endswith(group_separator): # -------------------------------------------- # Dimension name that ends with '/' # - # E.g. g = "dim/" - # E.g. g = "group1/dim/" + # E.g. "dim/" + # E.g. 
"group1/dim/" # -------------------------------------------- raise DimensionParsingException( "Dimension names can't end with the group " f"separator ({group_separator}): " - f"dimension_name={name}, variable={v.name}" + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" ) - elif ( - name.startswith(group_separator) - and f"..{group_separator}" not in name - ): + elif f"{group_separator}..{group_separator}" in name: # -------------------------------------------- - # Absolute path dimension name that starts - # with '/', and has no upward path traversals - # ('../'). + # Relative path dimension name with upward + # path traversals ('../') not at the start of + # the name # - # E.g. "/dim" - # E.g. "/group1/dim" + # E.g. "/group1/../group2/dim" + # E.g. "group1/../group2/dim" + # E.g. "../group1/../group2/dim" # -------------------------------------------- - if g == "": - g = group_separator + raise DimensionParsingException( + "In Zarr datasets, can't yet deal with a " + "relative path dimension name with upward path " + f"traversals (../) in middle of the name: " + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" + "\n\nPlease raise an issue at " + "https://github.com/NCAS-CMS/cfdm/issues " + "if you would like this feature." + ) elif name.startswith(f"..{group_separator}"): # -------------------------------------------- @@ -2097,8 +2138,8 @@ def _populate_dimension_maps(self, group): # path traversals ('../') at the start of the # name # - # E.g. "../group1/group2/dim" - # E.g. "../../group1/group2/dim" + # E.g. "../group1/dim" + # E.g. "../../group1/dim" # -------------------------------------------- current_group = group while g.startswith(f"..{group_separator}"): @@ -2111,69 +2152,114 @@ def _populate_dimension_maps(self, group): # We're about to go beyond the root # group! raise DimensionParsingException( - "Upward path traversals in dimension " + "Upward path traversals in Zarr dimension " "name go beyond the root group: " - f"dimension_name={name}, variable={v.name}" + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" ) g = group_separator.join((self.path(current_group), g)) - elif f"..{group_separator}" in name: + elif name.startswith(group_separator): # -------------------------------------------- - # Relative path dimension name with upward - # path traversals ('../') not at the start of - # the name + # Absolute path dimension name that starts + # with '/', and contains no upward path + # traversals ('../'). # - # E.g. "/group1/../group2/dim" - # E.g. "group1/../group2/dim" - # E.g. "../group1/../group2/dim" + # E.g. "/dim" + # E.g. "/group1/dim" # -------------------------------------------- - raise DimensionParsingException( - "In Zarr datasets, can't yet deal with a " - "relative path dimension name with upward path " - f"traversals (../) in middle of the name: " - f"dimension_name={name}, variable={v.name}. " - "Please raise an issue at " - "https://github.com/NCAS-CMS/cfdm/issues " - "if you really do need this feature." - ) + if g == "": + g = group_separator else: # -------------------------------------------- # Relative path dimension name which contains - # '/' and which has no upward path traversals + # '/' and which contains no upward path + # traversals ('../'). # - # E.g. "group2/dim" + # E.g. 
"group1/dim" # -------------------------------------------- g = group_separator.join((group_name, g)) + zarr_dim = None if g in group_to_dims: + # Group 'g' is already registered zarr_dim = group_to_dims[g].get(basename) if zarr_dim is not None: # Dimension 'basename' is already registered # in group 'g' if zarr_dim.size != size: raise DimensionParsingException( - f"Dimension {name} of variable {v.name} " - f"has the wrong size: {size}. It should " - f"have the size {zarr_dim.size}" + f"Zarr Dimension has the wrong size: {size}. " + f"Expected size {zarr_dim.size} " + "(defined by variable " + f"{zarr_dim.reference_variable().name}). " + f"dataset={self.dataset_name()} " + f"variable={var_name} " + f"dimension_name={name}" ) + else: + # Initialise group 'g' + group_to_dims[g] = {} - var_to_dims.setdefault(v.name, []).append(zarr_dim) - continue - # else: - # group_to_dims[g] = {} + if zarr_dim is None: + # Register a new ZarrDimension in group 'g' + zarr_dim = ZarrDimension(basename, size, input_ds[g], v) + group_to_dims[g][basename] = zarr_dim - # Still here? Then we're ready to define dimension - # 'basename' as a ZarrDimension object. - zarr_dim = ZarrDimension(basename, size, input_ds[g]) - var_to_dims.setdefault(v.name, []).append(zarr_dim) - group_to_dims[g][basename] = zarr_dim + # Map the variable to the ZarrDimension object + var_to_dims[var_name] += (zarr_dim,) # Recursively scan all child groups for g in group.group_values(): self._populate_dimension_maps(g) + def _variable_dimension_names(self, var): + """Return the dimension names for a variable. + + Currently this is only required for, and only works for, Zarr + variables. An `AttributeError` will be raised if called for + any other type of variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + var: + The variable object. + + :Returns: + + `list` of `str` + The variable's dimension names. A scalar variable will + have an empty list. + + """ + zarr_format = var.metadata.zarr_format + match zarr_format: + case 3: + dimensions = var.metadata.dimension_names + case 2: + dimensions = var.metadata.attrs.get("_ARRAY_DIMENSIONS") + case _: + raise DimensionParsingException( + f"Can't flatten a Zarr v{zarr_format} dataset. " + "Only Zarr v3 and v2 can be flattened" + ) + + if dimensions is None: + if var.shape: + raise DimensionParsingException( + f"Non-scalar Zarr v{zarr_format} variable has no " + f"dimension names: {var.name}" + ) + + dimensions = [] + + return dimensions + def _child_groups(self, group): """Return groups that are defined in this group. diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 249c9db76..bd857e969 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -11237,25 +11237,12 @@ def _file_variable_attributes(self, var): case "zarr": attrs = dict(var.attrs) - # Remove the _ARRAY_DIMENSIONS from Zarr v2 attributes - if var.info._zarr_format == 2: + # Remove _ARRAY_DIMENSIONS from Zarr v2 attributes + if var.metadata.zarr_format == 2: attrs.pop("_ARRAY_DIMENSIONS", None) return attrs - # try: - # # h5netcdf, zarr - # attrs = dict(var.attrs) - # except AttributeError: - # # netCDF4 - # return {attr: var.getncattr(attr) for attr in var.ncattrs()} - # else: - # if self.read_vars["dataset_opened_with"] == "zarr": - # # zarr: Remove the _ARRAY_DIMENSIONS attribute - # attrs.pop("_ARRAY_DIMENSIONS", None) - # - # return attrs - def _file_variable_dimensions(self, var): """Return the variable dimension names. 
diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 5244961d9..ca85c7990 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -139,8 +139,8 @@ def _create_variable_name(self, parent, default): except AttributeError: ncvar = default elif not self.write_vars["group"]: - # A flat file has been requested, so strip off any group - # structure from the name. + # A flat dataset has been requested, so strip off any + # group structure from the name. ncvar = self._remove_group_structure(ncvar) return self._name(ncvar) @@ -427,9 +427,9 @@ def _datatype(self, variable): For example, if variable.dtype is 'float32', then 'f4' will be returned. - For a NETCDF4 format file, numpy string data types will either - return `str` regardless of the numpy string length (and a - netCDF4 string type variable will be created) or, if + For a NETCDF4 format dataset, numpy string data types will + either return `str` regardless of the numpy string length (and + a netCDF4 string type variable will be created) or, if `self.write_vars['string']`` is `False`, ``'S1'`` (see below). For all other output netCDF formats (such NETCDF4_CLASSIC, @@ -440,8 +440,8 @@ def _datatype(self, variable): dimension) is expected to be done elsewhere (currently in the _write_netcdf_variable method). - If the input variable has no `!dtype` attribute (or it is None) - then 'S1' is returned, or `str` for NETCDF files. + If the input variable has no `!dtype` attribute (or it is + None) then 'S1' is returned, or `str` for NETCDF datasets. :Parameters: @@ -604,8 +604,8 @@ def _dataset_dimensions(self, field, key, construct): # ---------------------------------------------------- if sample_ncdim is None: # The list variable has not yet been written to - # the file, so write it and also get the dataset - # name of the sample dimension. + # the dataset, so write it and also get the + # dataset name of the sample dimension. list_variable = self.implementation.get_list(construct) sample_ncdim = self._write_list_variable( field, @@ -620,9 +620,9 @@ def _dataset_dimensions(self, field, key, construct): # Compression by contiguous ragged array # # No need to do anything because i) the count variable - # has already been written to the file, ii) we already - # have the position of the sample dimension in the - # compressed array, and iii) we already have the + # has already been written to the dataset, ii) we + # already have the position of the sample dimension in + # the compressed array, and iii) we already have the # dataset name of the sample dimension. # ---------------------------------------------------- pass @@ -632,9 +632,9 @@ def _dataset_dimensions(self, field, key, construct): # Compression by indexed ragged array # # No need to do anything because i) the index variable - # has already been written to the file, ii) we already - # have the position of the sample dimension in the - # compressed array, and iii) we already have the + # has already been written to the dataset, ii) we + # already have the position of the sample dimension in + # the compressed array, and iii) we already have the # dataset name of the sample dimension. # ---------------------------------------------------- pass @@ -712,7 +712,7 @@ def _write_dimension( except RuntimeError as error: message = ( "Can't create unlimited dimension " - f"in {g['netcdf'].data_model} file ({error})." + f"in {g['netcdf'].data_model} dataset ({error})." 
) error = str(error) @@ -733,16 +733,16 @@ def _write_dimension( except RuntimeError as error: raise RuntimeError( f"Can't create size {size} dimension {ncdim!r} in " - f"{g['netcdf'].data_model} file ({error})" + f"{g['netcdf'].data_model} dataset ({error})" ) g["dimensions"].add(ncdim) def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): - """Writes a coordinate variable and its bounds variable to file. + """Write a coordinate and bounds variables to the dataset. For netCDF datasets, this also writes a new dimension to the - file and, if required, a new dimension for the bounds. + dataset and, if required, a new dimension for the bounds. :Parameters: @@ -790,12 +790,12 @@ def _write_dimension_coordinate(self, f, key, coord, ncdim, coordinates): # coordinate. create = True - # If the dimension coordinate is already in the file but not - # in an approriate group then we have to create a new dataset - # variable. This is to prevent a downstream error ocurring - # when the parent data variable tries to reference one of its - # dataset dimensions that is not in the same group nor a - # parent group. + # If the dimension coordinate is already in the dataset but + # not in an approriate group then we have to create a new + # dataset variable. This is to prevent a downstream error + # ocurring when the parent data variable tries to reference + # one of its dataset dimensions that is not in the same group + # nor a parent group. if already_in_file and not create: ncvar = coord.nc_get_variable("") groups = self._groups(seen[id(coord)]["ncvar"]) @@ -988,7 +988,7 @@ def _write_scalar_data(self, f, value, ncvar): """Write a dimension coordinate and bounds to the dataset. For netCDF datasets, this also writes a new dimension to the - file and, if required, a new bounds dimension. + dataset and, if required, a new bounds dimension. .. note:: This function updates ``g['seen']``. @@ -1222,7 +1222,7 @@ def _already_in_file(self, variable, ncdims=None, ignore_type=False): `bool` `True` if the variable has already been written to the - file, `False` otherwise. + dataset, `False` otherwise. """ g = self.write_vars @@ -1267,7 +1267,8 @@ def _write_geometry_container(self, field, geometry_container): # Use this existing geometry container return ncvar - # Still here? Then write the geometry container to the file + # Still here? Then write the geometry container to the + # dataset. ncvar = self.implementation.nc_get_geometry_variable( field, default="geometry_container" ) @@ -1372,8 +1373,8 @@ def _write_bounds( bounds, f"bounds{size}" ) if not g["group"]: - # A flat file has been requested, so strip off any group - # structure from the name. + # A flat dataset has been requested, so strip off any + # group structure from the name. bounds_ncdim = self._remove_group_structure(bounds_ncdim) bounds_ncdim = self._name(bounds_ncdim, dimsize=size, role="bounds") @@ -1429,7 +1430,7 @@ def _write_bounds( ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name (for now). ncvar = self._remove_group_structure(ncvar) @@ -1564,7 +1565,7 @@ def _write_node_coordinates( create = False # We need to log the original Bounds variable as being - # in the file, too. This is so that the geometry + # in the dataset, too. This is so that the geometry # container variable can be created later on. 
g["seen"][id(bounds)] = { "ncvar": ncvar, @@ -1615,7 +1616,7 @@ def _write_node_coordinates( bounds, default=default ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncvar = self._remove_group_structure(ncvar) @@ -1650,7 +1651,7 @@ def _write_node_coordinates( g["geometry_encoding"][ncvar] = encodings # We need to log the original Bounds variable as being in - # the file, too. This is so that the geometry container + # the dataset, too. This is so that the geometry container # variable can be created later on. g["seen"][id(bounds)] = { "ncvar": ncvar, @@ -1718,7 +1719,7 @@ def _write_node_count( ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncvar = self._remove_group_structure(ncvar) @@ -1787,7 +1788,7 @@ def _get_part_ncdimension(self, coord, default=None): if ncdim is not None: # Found a dataset dimension if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncdim = self._remove_group_structure(ncdim) @@ -1923,7 +1924,7 @@ def _get_node_ncdimension(self, bounds, default=None): if ncdim is not None: # Found a dimension if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncdim = self._remove_group_structure(ncdim) @@ -1983,7 +1984,7 @@ def _write_part_node_count(self, f, coord, bounds, encodings): pnc, default="part_node_count" ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. ncvar = self._remove_group_structure(ncvar) @@ -2080,8 +2081,8 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ) if not self.write_vars["group"]: - # A flat file has been requested, so strip off any group - # structure from the name. + # A flat dataset has been requested, so strip off any + # group structure from the name. ncvar = self._remove_group_structure(ncvar) size = self.implementation.get_data_size(interior_ring) @@ -2143,7 +2144,7 @@ def _write_scalar_coordinate( this is not checked. If an equal scalar coordinate has already been written to the - file then the input coordinate is not written. + dataset then the input coordinate is not written. :Parameters: @@ -2194,7 +2195,7 @@ def _write_scalar_coordinate( else: # This scalar coordinate has already been written to the - # file + # dataset ncvar = g["seen"][id(scalar_coord)]["ncvar"] g["axis_to_ncscalar"][axis] = ncvar @@ -2209,7 +2210,7 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): """Write auxiliary coordinates and bounds to the dataset. If an equal auxiliary coordinate has already been written to - the file then the input coordinate is not written. + the dataset then the input coordinate is not written. :Parameters: @@ -2302,8 +2303,8 @@ def _write_auxiliary_coordinate(self, f, key, coord, coordinates): def _write_domain_ancillary(self, f, key, anc): """Write a domain ancillary and its bounds to the dataset. - If an equal domain ancillary has already been written to the file - athen it is not re-written. 
+ If an equal domain ancillary has already been written to the + dataset then it is not re-written. .. versionadded:: (cfdm) 1.7.0 @@ -2389,7 +2390,7 @@ def _write_field_ancillary( """Write a field ancillary to the dataset. If an equal field ancillary has already been written to the - file then it is not re-written. + dataset then it is not re-written. :Parameters: @@ -2438,7 +2439,7 @@ def _write_field_ancillary( def _write_cell_measure(self, f, key, cell_measure): """Write a cell measure construct to the dataset. - If an identical construct has already in the file then the + If an identical construct has already in the dataset then the cell measure will not be written. :Parameters: @@ -2487,11 +2488,12 @@ def _write_cell_measure(self, f, key, cell_measure): self._set_external_variables(ncvar) if ( - g["external_file"] is not None + g["external_dataset"] is not None and self.implementation.get_data(cell_measure, None) is not None ): - # Create a new field to write out to the external file + # Create a new field to write out to the external + # dataset self._create_external( field=f, construct_id=key, @@ -2538,14 +2540,10 @@ def _set_external_variables(self, ncvar): group=g["dataset"], ) - # g["dataset"].setncattr( - # "external_variables", " ".join(sorted(external_variables)) - # ) - def _create_external( self, field=None, construct_id=None, ncvar=None, ncdimensions=None ): - """Creates a new field to flag to write to an external file. + """Creates a new field to flag to write to an external dataset. .. versionadded:: (cfdm) 1.7.0 @@ -2691,10 +2689,11 @@ def _write_grid_mapping(self, f, ref, multiple_grid_mappings): :Parameters: - f: Field construct + f: `Field` or `Domain` ref: `CoordinateReference` - The grid mapping coordinate reference to write to the file. + The grid mapping coordinate reference to write to the + dataset. multiple_grid_mappings: `bool` @@ -2816,7 +2815,7 @@ def _write_netcdf_variable( The netCDF dimension names of the variable cfvar: `Variable` or `Data` - The construct to write to the netCDF file. + The construct to write to the dataset. domain_axes: `None`, or `tuple` of `str` The domain axis construct identifiers for *cfvar*. @@ -3026,9 +3025,9 @@ def _write_netcdf_variable( q, netcdf_parameter, None ) - # Create a quantization container variable in the file, if - # it doesn't already exist (and after having removed any - # per-variable quantization parameters, such as + # Create a quantization container variable in the dataset, + # if it doesn't already exist (and after having removed + # any per-variable quantization parameters, such as # "quantization_nsd"). if quantize_on_write: if g["backend"] == "zarr": @@ -3082,7 +3081,7 @@ def _write_netcdf_variable( if g["fmt"] not in NETCDF4_FMTS: raise ValueError( f"Can't quantize {cfvar!r} into a {g['fmt']} " - "format file. Quantization is only possible when " + "format dataset. Quantization is only possible when " f"writing to one of the {NETCDF4_FMTS} formats." ) @@ -3141,7 +3140,7 @@ def _write_netcdf_variable( # keyword arguments. This is necessary because the # dimensions and dataset chunking strategy will # otherwise reflect the aggregated data in memory, - # rather than the scalar variable in the file. + # rather than the scalar variable in the dataset. 
kwargs["contiguous"] = True kwargs["chunksizes"] = None kwargs["dimensions"] = () @@ -3175,7 +3174,7 @@ def _write_netcdf_variable( except RuntimeError as error: error = str(error) message = ( - f"Can't create variable in {g['netcdf'].data_model} file " + f"Can't create variable in {g['netcdf'].data_model} dataset " f"from {cfvar!r}: {error}. " f"_createVariable arguments: {kwargs}" ) @@ -3184,13 +3183,13 @@ def _write_netcdf_variable( ): raise ValueError( f"Can't write {cfvar.data.dtype.name} data from {cfvar!r} " - f"to a {g['netcdf'].data_model} file. " + f"to a {g['netcdf'].data_model} dataset. " "Consider using a netCDF4 format, or use the 'datatype' " "parameter, or change the datatype before writing." ) elif error == "NetCDF: NC_UNLIMITED in the wrong index": raise RuntimeError( - f"{message}. In a {g['netcdf'].data_model} file the " + f"{message}. In a {g['netcdf'].data_model} dataset the " "unlimited dimension must be the first (leftmost) " "dimension of the variable. " "Consider using a netCDF4 format." @@ -3353,7 +3352,7 @@ def _write_data( attributes: `dict`, optional The dataset attributes for the constructs that have - been written to the file. + been written to the dataset. construct_type: `str` The construct type of the *cfvar*, or its parent if @@ -3408,7 +3407,7 @@ def _write_data( meta=np.array((), dx.dtype), ) - # Initialise the file lock for the data writing from Dask + # Initialise the dataset lock for the data writing from Dask lock = None # Rechunk the Dask array to shards, if applicable. @@ -3450,7 +3449,7 @@ def _write_data( ) if lock is None: - # We need to define the file lock for data writing from + # We need to define the dataset lock for data writing from # Dask from ...data.locks import netcdf_lock as lock @@ -3602,7 +3601,7 @@ def _convert_to_char(self, data): def _write_field_or_domain( self, f, add_to_seen=False, allow_data_insert_dimension=True ): - """Write a field or domain construct to the file. + """Write a field or domain construct to the dataset. All of the metadata constructs are also written. @@ -3659,28 +3658,28 @@ def _write_field_or_domain( # Mapping of domain axis identifiers to dataset dimension # names. This gets reset for each new field/domain that is - # written to the file. + # written to the dataset. # # For example: {'domainaxis1': 'lon'} g["axis_to_ncdim"] = {} # Mapping of domain axis identifiers to dataset scalar # coordinate variable names. This gets reset for each new - # field/domain that is written to the file. + # field/domain that is written to the dataset. # # For example: {'domainaxis0': 'time'} g["axis_to_ncscalar"] = {} # Mapping of construct internal identifiers to dataset # variable names. This gets reset for each new field/domain - # that is written to the file. + # that is written to the dataset. # # For example: {'dimensioncoordinate1': 'longitude'} g["key_to_ncvar"] = {} # Mapping of construct internal identifiers to their dataset # dimensions. This gets reset for each new field/domain that - # is written to the file. + # is written to the dataset. # # For example: {'dimensioncoordinate1': ['longitude']} g["key_to_ncdims"] = {} @@ -3830,7 +3829,7 @@ def _write_field_or_domain( # ---------------------------------------------------- if axis in data_axes: # The data array spans this domain axis, so write - # the dimension coordinate to the file as a + # the dimension coordinate to the dataset as a # coordinate variable. 
ncvar = self._write_dimension_coordinate( f, key, dim_coord, ncdim=ncdim, coordinates=coordinates @@ -3851,7 +3850,7 @@ def _write_field_or_domain( # auxiliary coordinates, cell measures, domain # ancillaries or field ancillaries which span # this domain axis. Therefore write the - # dimension coordinate to the file as a + # dimension coordinate to the dataset as a # coordinate variable. ncvar = self._write_dimension_coordinate( f, @@ -3873,8 +3872,8 @@ def _write_field_or_domain( # coordinates, cell measures, domain # ancillaries or field ancillaries which span # this domain axis. Therefore write the - # dimension coordinate to the file as a scalar - # coordinate variable. + # dimension coordinate to the dataset as a + # scalar coordinate variable. coordinates = self._write_scalar_coordinate( f, key, dim_coord, axis, coordinates ) @@ -4022,8 +4021,9 @@ def _write_field_or_domain( ) if not g["group"]: - # A flat file has been requested, so strip - # off any group structure from the name. + # A flat dataset has been requested, so + # strip off any group structure from the + # name. ncdim = self._remove_group_structure(ncdim) ncdim = self._name(ncdim) @@ -4063,8 +4063,8 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by gathering # - # Write the list variable to the file, making a note - # of the dataset sample dimension. + # Write the list variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- list_variable = self.implementation.get_list(f) compress = " ".join(compressed_ncdims) @@ -4076,8 +4076,8 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by contiguous ragged array # - # Write the count variable to the file, making a note - # of the dataset sample dimension. + # Write the count variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- count = self.implementation.get_count(f) sample_ncdim = self._write_count_variable( @@ -4088,16 +4088,16 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by indexed ragged array # - # Write the index variable to the file, making a note - # of the dataset sample dimension. + # Write the index variable to the dataset, making a + # note of the dataset sample dimension. # ---------------------------------------------------- index = self.implementation.get_index(f) index_ncdim = self.implementation.nc_get_dimension( index, default="sample" ) if not g["group"]: - # A flat file has been requested, so strip off any - # group structure from the name. + # A flat dataset has been requested, so strip off + # any group structure from the name. index_ncdim = self._remove_group_structure(index_ncdim) sample_ncdim = self._write_index_variable( @@ -4113,8 +4113,8 @@ def _write_field_or_domain( # ---------------------------------------------------- # Compression by indexed contigous ragged array # - # Write the index variable to the file, making a note - # of the dataset sample dimension. + # Write the index variable to the dataset, making a + # note of the dataset sample dimension. 
# ---------------------------------------------------- count = self.implementation.get_count(f) count_ncdim = self.implementation.nc_get_dimension( @@ -4122,8 +4122,8 @@ def _write_field_or_domain( ) if not g["group"]: - # A flat file has been requested, so strip off any - # group structure from the name. + # A flat dataset has been requested, so strip off + # any group structure from the name. count_ncdim = self._remove_group_structure(count_ncdim) sample_ncdim = self._write_count_variable( @@ -4131,7 +4131,7 @@ def _write_field_or_domain( ) if not g["group"]: - # A flat file has been requested, so strip off any + # A flat dataset has been requested, so strip off any # group structure from the name. sample_ncdim = self._remove_group_structure(sample_ncdim) @@ -4589,8 +4589,8 @@ def _create_vertical_datum(self, ref, coord_key): ncvar = self.implementation.nc_get_variable(datum) if ncvar is not None: if not self.write_vars["group"]: - # A flat file has been requested, so strip off any - # group structure from the name. + # A flat dataset has been requested, so strip off + # any group structure from the name. ncvar = self._remove_group_structure(ncvar) self.implementation.nc_set_variable(new_grid_mapping, ncvar) @@ -4618,7 +4618,7 @@ def _unlimited(self, field, axis): return self.implementation.nc_is_unlimited_axis(field, axis) def _write_group_attributes(self, fields): - """Writes the group-level attributes to the file. + """Writes the group-level attributes to the dataset. :Parameters: @@ -4667,7 +4667,7 @@ def _write_group_attributes(self, fields): break # -------------------------------------------------------- - # Write the group-level attributes to the file + # Write the group-level attributes to the dataset # -------------------------------------------------------- # Replace None values with actual values for attr, value in this_group_attributes.items(): @@ -4678,17 +4678,9 @@ def _write_group_attributes(self, fields): f0, attr ) - # nc = g["dataset"] # TODOZARR nc = self._get_group(g["dataset"], groups) - # for group in groups: - # print (' nc.groups=', repr(nc.groups)) - # if group in nc.groups: - # nc = nc.groups[group] - # else: - # nc = self._createGroup(nc, group) if not g["dry_run"]: - # nc.setncatts(this_group_attributes) self._set_attributes(this_group_attributes, group=nc) group_attributes[groups] = tuple(this_group_attributes) @@ -4834,7 +4826,7 @@ def _write_global_attributes(self, fields): break # ----------------------------------------------------------- - # Write the Conventions global attribute to the file + # Write the Conventions global attribute to the dataset # ------------------------------------------------------------ delimiter = " " set_Conventions = force_global.pop("Conventions", None) @@ -4879,14 +4871,14 @@ def _write_global_attributes(self, fields): # ) # ------------------------------------------------------------ - # Write the file descriptors to the file + # Write the file descriptors to the dataset # ------------------------------------------------------------ attrs.update(g["file_descriptors"]) # for attr, value in g["file_descriptors"].items(): # g["dataset"].setncattr(attr, value) # ------------------------------------------------------------ - # Write other global attributes to the file + # Write other global attributes to the dataset # ------------------------------------------------------------ attrs.update( { @@ -4900,7 +4892,7 @@ def _write_global_attributes(self, fields): # ) # ------------------------------------------------------------ - # 
Write "forced" global attributes to the file + # Write "forced" global attributes to the dataset # ------------------------------------------------------------ attrs.update(force_global) @@ -4987,11 +4979,11 @@ def dataset_open(self, dataset_name, mode, fmt, fields): `netCDF4.Dataset` instance. Ignored for Zarr datasets. fields: sequence of `Field` or `Domain` - The constructs to be written to the netCDF file. Note - that these constructs are only used to ascertain if - any data to be written is in *dataset_name*. If this - is the case and mode is "w" then an exception is - raised to prevent *dataset_name* from being deleted. + The constructs to be written to the dataset. Note that + these constructs are only used to ascertain if any + data to be written is in *dataset_name*. If this is + the case and mode is "w" then an exception is raised + to prevent *dataset_name* from being deleted. :Returns: @@ -5015,7 +5007,7 @@ def dataset_open(self, dataset_name, mode, fmt, fields): # mode == 'w' is safer than != 'a' in case of a typo (the # letters are neighbours on a QWERTY keyboard) since 'w' is # destructive. Note that for append ('a') mode the original - # file is never wiped. + # dataset is never wiped. if mode == "w" and g["overwrite"]: self.dataset_remove() @@ -5077,50 +5069,47 @@ def write( cfa="auto", reference_datetime=None, ): - """Write field and domain constructs to a netCDF file. - - NetCDF dimension and variable names will be taken from - variables' `ncvar` attributes and the field attribute - `!ncdimensions` if present, otherwise they are inferred from - standard names or set to defaults. NetCDF names may be - automatically given a numerical suffix to avoid duplication. + """Write field and domain constructs to a dataset. - Output netCDF file global properties are those which occur in the set - of CF global properties and non-standard data variable properties and - which have equal values across all input fields. + Output global properties are those which occur in the set of + CF global properties and non-standard data variable properties + and which have equal values across all input fields. - Logically identical field components are only written to the file - once, apart from when they need to fulfil both dimension coordinate - and auxiliary coordinate roles for different data variables. + Logically identical field components are only written to the + datset once, apart from when they need to fulfil both + dimension coordinate and auxiliary coordinate roles for + different data variables. .. versionadded:: (cfdm) 1.7.0 :Parameters: fields : (sequence of) `cfdm.Field` - The field or fields to write to the file. + The field or fields to write to the dataset. See `cfdm.write` for details. dataset_name: str - The output CF-netCDF file. TODOZARR + The output dataset. See `cfdm.write` for details. mode: `str`, optional - Specify the mode of write access for the output file. One of: + Specify the mode of write access for the output + dataset. One of: ======== ================================================= *mode* Description ======== ================================================= - ``'w'`` Open a new file for writing to. If it exists and - *overwrite* is True then the file is deleted - prior to being recreated. - ``'a'`` Open an existing file for appending new + ``'w'``-- Open a new dataset for writing to. If it + exists and *overwrite* is True then the + dataset is deleted prior to being recreated. + + ``'a'`` Open an existing dataset for appending new information to. 
The new information will be incorporated whilst the original contents of the - file will be preserved. + dataset will be preserved. In practice this means that new fields will be created, whilst the original fields will not be @@ -5129,7 +5118,7 @@ def write( For append mode, note the following: - * Global attributes on the file + * Global attributes on the dataset will remain the same as they were originally, so will become inaccurate where appended fields have incompatible attributes. To rectify this, @@ -5139,7 +5128,7 @@ def write( `nc_set_global_attribute`. * Fields with incompatible ``featureType`` to - the original file cannot be appended. + the original dataset cannot be appended. * At present fields with groups cannot be appended, but this will be possible in a future @@ -5159,13 +5148,13 @@ def write( ======== ================================================= - By default the file is opened with write access mode - ``'w'``. + By default the dataset is opened with write access + mode ``'w'``. overwrite: bool, optional - If False then raise an exception if the output file - pre-exists. By default a pre-existing output file is - over written. + If False then raise an exception if the output dataset + pre-exists. By default a pre-existing output dataset + is over written. See `cfdm.write` for details. @@ -5173,8 +5162,9 @@ def write( See `cfdm.write` for details. file_descriptors: `dict`, optional - Create description of file contents netCDF global - attributes from the specified attributes and their values. + Create description of dataset contents netCDF global + attributes from the specified attributes and their + values. See `cfdm.write` for details. @@ -5192,9 +5182,9 @@ def write( See `cfdm.write` for details. external: `str`, optional - Write metadata constructs that have data and are marked as - external to the named external file. Ignored if there are - no such constructs. + Write metadata constructs that have data and are + marked as external to the named external + dataset. Ignored if there are no such constructs. See `cfdm.write` for details. @@ -5211,8 +5201,8 @@ def write( See `cfdm.write` for details. endian: `str`, optional - The endian-ness of the output file. Ignored for Zarr - datasets. + The endian-ness of the output dataset. Ignored for + Zarr datasets. See `cfdm.write` for details. @@ -5255,14 +5245,15 @@ def write( string: `bool`, optional By default string-valued construct data are written as - netCDF arrays of type string if the output file format is - ``'NETCDF4'``, or of type char with an extra dimension - denoting the maximum string length for any other output - file format (see the *fmt* parameter). If *string* is False - then string-valued construct data are written as netCDF - arrays of type char with an extra dimension denoting the - maximum string length, regardless of the selected output - file format. + netCDF arrays of type string if the output dataset + format is ``'NETCDF4'`` or ``'ZARR3'``, or of type + char with an extra dimension denoting the maximum + string length for any other output dataset format (see + the *fmt* parameter). If *string* is False then + string-valued construct data are written as netCDF + arrays of type char with an extra dimension denoting + the maximum string length, regardless of the selected + output dataset format. See `cfdm.write` for details. @@ -5278,7 +5269,7 @@ def write( The consequence of writing out-of-range data values is that, by default, these values will be masked when the - file is subsequently read. 
+ dataset is subsequently read. *Parameter example:* If a construct has ``valid_max`` property with value @@ -5289,9 +5280,9 @@ def write( .. versionadded:: (cfdm) 1.8.3 group: `bool`, optional - If False then create a "flat" netCDF file, i.e. one with - only the root group, regardless of any group structure - specified by the field constructs. + If False then create a "flat" netCDF dataset, i.e. one + with only the root group, regardless of any group + structure specified by the field constructs. See `cfdm.write` for details. @@ -5340,7 +5331,7 @@ def write( """ logger.info(f"Writing to {fmt}") # pragma: no cover - # Expand file name + # Expand dataset name dataset_name = os.path.expanduser(os.path.expandvars(dataset_name)) dataset_name = abspath(dataset_name) @@ -5365,10 +5356,14 @@ def write( # ------------------------------------------------------------ self.write_vars = { "dataset_name": dataset_name, - # Format of output file + # Format of output dataset "fmt": None, + # Backend for writing to the dataset + "backend": None, + # Whether the output datset is a file or a directory + "dataset_type": None, # netCDF4.Dataset instance - "netcdf": None, + # "netcdf": None, # Map netCDF variable names to netCDF4.Variable instances "nc": {}, # Map netCDF dimension names to netCDF dimension sizes @@ -5402,8 +5397,8 @@ def write( ), # Data type conversions to be applied prior to writing "datatype": {}, - # Whether or not to write string data-types to netCDF4 - # files (as opposed to car data-types). + # Whether or not to write string data-types to the output + # dataset (as opposed to char data-types). "string": string, # Conventions "Conventions": Conventions, @@ -5431,7 +5426,8 @@ def write( # dimensions keyed by items of the field (such as a # coordinate or a coordinate reference) "seen": {}, - # Dry run: populate 'seen' dict without actually writing to file. + # Dry run: populate 'seen' dict without actually writing + # to dataset. "dry_run": False, # To indicate if the previous iteration was a dry run: # @@ -5450,13 +5446,13 @@ def write( # -------------------------------------------------------- # Configuration options for writing aggregation variables "cfa": None, - # The directory of the aggregation file + # The directory of the aggregation dataset "aggregation_file_directory": None, # Cache the CF aggregation variable write status for each # dataset variable "cfa_write_status": {}, # -------------------------------------------------------- - # Dataset chunking and sharding stategy + # Dataset chunking and sharding stategy # -------------------------------------------------------- "dataset_chunks": dataset_chunks, "dataset_shards": dataset_shards, @@ -5544,7 +5540,7 @@ def write( effective_fields = fields if mode == "a": - # First read in the fields from the existing file: + # First read in the fields from the existing dataset: effective_fields = self._NetCDFRead(self.implementation).read( dataset_name, netcdf_backend="netCDF4" ) @@ -5605,7 +5601,7 @@ def write( ): raise ValueError( "Can't append fields with an incompatible 'featureType' " - "global attribute to the original file." + "global attribute to the original dataset." 
) self._file_io_iteration( @@ -5691,9 +5687,9 @@ def _file_io_iteration( warn_valid, group, ): - """Perform a file-writing iteration with the given settings.""" + """Perform a dataset-writing iteration.""" # ------------------------------------------------------------ - # Initiate file IO with given write variables + # Initiate dataset IO with given write variables # ------------------------------------------------------------ if mode == "w": desc = "Writing to" @@ -5733,7 +5729,7 @@ def _file_io_iteration( g["group"] = False elif fmt not in NETCDF4_FMTS + ZARR_FMTS: raise ValueError( - f"Unknown output file format: {fmt!r}. " + f"Unknown output dataset format: {fmt!r}. " f"Valid formats are {NETCDF4_FMTS + NETCDF3_FMTS + ZARR_FMTS}" ) @@ -5800,12 +5796,13 @@ def _file_io_iteration( g["least_significant_digit"] = least_significant_digit g["fmt"] = fmt - if fmt == "ZARR3": - g["backend"] = "zarr" - g["dataset_type"] = "directory" - else: - g["backend"] = "netCDF4" - g["dataset_type"] = "file" + match fmt: + case "ZARR3": + g["backend"] = "zarr" + g["dataset_type"] = "directory" + case _: + g["backend"] = "netCDF4" + g["dataset_type"] = "file" if isinstance( fields, @@ -5837,7 +5834,7 @@ def _file_io_iteration( if self.dataset_exists(dataset_name): if mode == "w" and not overwrite: raise IOError( - "Can't write with mode {mode!r} to existing dataset " + f"Can't write with mode {mode!r} to existing dataset " f"{os.path.abspath(dataset_name)} unless overwrite=True" ) @@ -5854,7 +5851,7 @@ def _file_io_iteration( if not g["dry_run"]: # -------------------------------------------------------- - # Write global properties to the file first. This is + # Write global properties to the dataset first. This is # important as doing it later could slow things down # enormously. This function also creates the # g['global_attributes'] set, which is used in the @@ -5863,7 +5860,7 @@ def _file_io_iteration( self._write_global_attributes(fields) # -------------------------------------------------------- - # Write group-level properties to the file next + # Write group-level properties to the dataset next # -------------------------------------------------------- if ( g["group"] and not g["post_dry_run"] @@ -5882,10 +5879,10 @@ def _file_io_iteration( external = os.path.expanduser(os.path.expandvars(external)) if os.path.realpath(external) == os.path.realpath(dataset_name): raise ValueError( - "Can't set dataset_name and external to the same path" - ) # TODOZARR + "Can't set 'dataset_name' and 'external' to the same path" + ) - g["external_file"] = external + g["external_dataset"] = external # ------------------------------------------------------------ # Write each field construct @@ -5896,18 +5893,18 @@ def _file_io_iteration( # ------------------------------------------------------------ # Write all of the buffered data to disk # ------------------------------------------------------------ - # For append mode, it is cleaner code-wise to close the file - # on the read iteration and re-open it for the append + # For append mode, it is cleaner code-wise to close the + # dataset on the read iteration and re-open it for the append # iteration. So we always close it here. 
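A minimal usage sketch of the new ``ZARR3`` output path selected earlier in this method (the backend match on ``fmt``), assuming this patch series is applied; the file names are illustrative:

    import cfdm

    # Read from any supported format and write to a Zarr v3 store,
    # which is a directory on disk rather than a single file:
    fields = cfdm.read("input.nc")
    cfdm.write(fields, "output.zarr", fmt="ZARR3")

As the comment above notes, the dataset is then always closed below, whichever backend was used.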
self.dataset_close() # ------------------------------------------------------------ - # Write external fields to the external file + # Write external fields to the external dataset # ------------------------------------------------------------ - if g["external_fields"] and g["external_file"] is not None: + if g["external_fields"] and g["external_dataset"] is not None: self.write( fields=g["external_fields"], - dataset_name=g["external_file"], + dataset_name=g["external_dataset"], fmt=fmt, overwrite=overwrite, datatype=datatype, @@ -6059,7 +6056,7 @@ def _chunking_parameters(self, data, ncdimensions): # KiB'), and the data shape (e.g. (12, 73, 96)). if self._compressed_data(ncdimensions): # Base the dataset chunks on the compressed data that is - # going into the file + # going into the dataset d = self.implementation.get_compressed_array(data) else: d = data @@ -6090,7 +6087,7 @@ def _compressed_data(self, ncdimensions): ncdimensions: `sequence` of `str` The ordered dataset dimension names of the data. These - are the dimensions going into the file, and if the + are the dimensions going into the dataset, and if the data is compressed will differ from the dimensions implied by the data in memory. @@ -6470,8 +6467,8 @@ def _cfa_write_fragment_array_variable( create = not self._already_in_file(data, ncdimensions) if create: - # Create a new fragment array variable in the file, with - # 'contiguous' chunking + # Create a new fragment array variable in the dataset, + # with 'contiguous' chunking ncvar = self._name(ncvar) self._write_netcdf_variable( ncvar, @@ -6483,7 +6480,7 @@ def _cfa_write_fragment_array_variable( ) else: # This fragment array variable has already been written to - # the file + # the dataset ncvar = self.write_vars["seen"][id(data)]["ncvar"] return ncvar @@ -6563,7 +6560,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): f"Can't write {cfvar!r} as a CF aggregation variable. " "This is could be " "because some fragment values in memory have been " - "changed relative to those in the fragment files, " + "changed relative to those in the fragment datasets, " "or a Dask rechunking has occured, etc." ) @@ -6596,7 +6593,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): normalise = not uri_default if uri_relative: - # Get the aggregation file directory as an absolute + # Get the aggregation dataset directory as an absolute # URI aggregation_file_directory = g["aggregation_file_directory"] if aggregation_file_directory is None: @@ -6625,7 +6622,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): data.chunk_indices(), data.chunk_positions() ): # Try to get this Dask chunk's data as a reference to - # fragment file + # fragment dataset fragment = data[index].compute(_force_to_memory=False) try: dataset_name, address, is_subspace, f_index = ( @@ -6642,21 +6639,21 @@ def _cfa_fragment_array_variables(self, data, cfvar): "aggregation variable: " f"The Dask chunk in position {position} " f"(defined by data index {index!r}) does not " - "reference a unique fragment file. This is could be " - "because some fragment values in memory have been " - "changed relative to those in the fragment files, " + "reference a unique fragment dataset. This is could " + "be because some fragment values in memory have been " + "changed relative to those in the fragment datasets, " "or a Dask rechunking has occured, etc." ) if is_subspace: # This Dask chunk's data is a reference to - # fragment file, but only to a subspace of it. + # fragment dataset, but only to a subspace of it. 
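A worked illustration of the constraint enforced here, with hypothetical fragment files: aggregation writing needs each Dask chunk to map onto one whole fragment, e.g.

    fragment_1.nc -> data[0:12]
    fragment_2.nc -> data[12:24]
    fragment_3.nc -> data[24:36]

so an in-memory chunk size of, say, 8 along that axis would make a chunk reference only part of a fragment, which is the subspace case that raises the error next.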
raise AggregationError( f"Can't write {cfvar!r} as a CF " "aggregation variable: " f"The Dask chunk in position {position} " f"(defined by data index {index!r}) references " - f"a subspace ({f_index!r}) of the fragment file " + f"a subspace ({f_index!r}) of the fragment dataset " f"{fragment!r}. This might be fixable by setting " "the 'cfa_write' keyword in the 'read' function." ) @@ -6666,7 +6663,7 @@ def _cfa_fragment_array_variables(self, data, cfvar): dataset_name = abspath(dataset_name) if uri.isabspath(): - # File name is an absolute-path URI reference + # Dataset name is an absolute-path URI reference dataset_name = uricompose( scheme="file", authority="", @@ -6683,10 +6680,10 @@ def _cfa_fragment_array_variables(self, data, cfvar): f"Can't write {cfvar!r} as a CF " "aggregation variable: " "Attempting to create a relative-path URI " - f"reference for the fragment file {fragment}, " + f"reference for the fragment dataset {fragment}, " "referenced by the Dask chunk in position " f"{position} (defined by data index {index!r}), " - "but the aggregation file URI scheme " + "but the aggregation dataset URI scheme " f"({aggregation_file_scheme}:) is incompatible." ) diff --git a/cfdm/read_write/netcdf/zarr.py b/cfdm/read_write/netcdf/zarr.py index 4778d7e5e..4db149e2e 100644 --- a/cfdm/read_write/netcdf/zarr.py +++ b/cfdm/read_write/netcdf/zarr.py @@ -8,7 +8,7 @@ class ZarrDimension: """ - def __init__(self, name, size, group): + def __init__(self, name, size, group, reference_variable=None): """**Initialisation** :Parameters: @@ -22,10 +22,14 @@ def __init__(self, name, size, group): group: `zarr.Group` The group that the dimension is a member of. + reference_variable: `zarr.Array`, optional + The variable that provided the dimension defintion. + """ - self.name = name - self.size = size + self._name = name + self._size = size self._group = group + self._reference_variable = reference_variable def __len__(self): """The size of the dimension. @@ -45,7 +49,25 @@ def __repr__(self): .. versionadded:: (cfdm) 1.12.2.0 """ - return f"" + return f"" + + @property + def name(self): + """Return the dimension name. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self._name + + @property + def size(self): + """Return the dimension size. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self._size def group(self): """Return the group that the dimension is a member of. @@ -74,3 +96,21 @@ def isunlimited(self): """ return False + + def reference_variable(self): + """Return the variable that provided the dimension definition. + + Note that the variable does not have to be in the dimension's + `group`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `zarr.Array` or `None` + The variable that provided the dimension defintion, or + `None` if it wasn't provided during instance + initialisation. 
+ + """ + return self._reference_variable From 59303f580fc8069b87b51121bca0cbf9fbabe78e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 3 Sep 2025 01:00:18 +0100 Subject: [PATCH 19/39] dev --- cfdm/docstring/docstring.py | 42 +++++ cfdm/read_write/netcdf/flatten/flatten.py | 139 ++++++++------- cfdm/read_write/netcdf/netcdfread.py | 207 +++++----------------- cfdm/read_write/read.py | 6 + 4 files changed, 171 insertions(+), 223 deletions(-) diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index da9567342..9f8348553 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -725,6 +725,48 @@ Note that setting ``recursive=True, followlinks=True`` can lead to infinite recursion if a symbolic link points to a parent directory of itself.""", + # read group_dimension_search + "{{read group_dimension_search: `str`, optional}}": """group_dimension_search: `str`, optional + How to interpret a sub-group dimension name that has no + path, i.e. that contains no group-separator characters, + such as ``dim`` (as opposed to ``group/dim``, + ``/group/dim``, etc.). Such a dimension name could be a + variable array dimension name, or be referenced by + variable attribute. + + This is only required for reading a Zarr dataset, for + which there is no means of indicating whether the same + dimension names that appear in different groups correspond + to each other, or not. + + For a non-Zarr dataset that adheres to the netCDF data + model, *group_dimension_search* is ignored because any + correspondence between dimensions is already explicitly + recorded. + + The *group_dimension_search* parameter must be one of: + + * ``'furthest_ancestor'`` + + This is the default. Assume that the Zarr sub-group + dimension is the same as the one with the same name and + size in an ancestor group, if one exists. If multiple + such dimensions exist, then the correspondence is with + the dimension in the ancestor group that is furthest + away from the sub-group. + + * ``'closet_ancestor'`` + + Assume that the Zarr sub-group dimension is the same as + the dimension with the same name and size in an ancestor + group, if one exists. If multiple such dimensions exist, + then the correspondence is with the dimension in the + ancestor group that is closest to the sub-group. + + * ``'local'`` + + Assume that the Zarr sub-group dimension is different to + any with the same name and size in ancestor groups.""", # persist "{{persist description}}": """Persisting turns an underlying lazy dask array into an equivalent chunked dask array, but now with the results fully diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index bdae1b905..3eb26deac 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -52,7 +52,7 @@ def netcdf_flatten( strict=True, omit_data=False, write_chunksize=134217728, - dimension_mode="ancestor", + dimension_search="furthest_ancestor", ): """Create a flattened version of a grouped CF dataset. @@ -119,37 +119,47 @@ def netcdf_flatten( *input_ds* to *output_ds* for each piece. Ignored if *omit_data* is True. - dimension_mode: `str`, optional - How to interpret a dimension name that has no path, - i.e. one that contains no group-separator characters, such - as ``dim``, as opposed to ``group/dim`` or ``/group/dim``, - etc. + dimension_search: `str`, optional + How to interpret a sub-group dimension name that has no + path, i.e. 
that contains no group-separator characters, + such as ``dim`` (as opposed to ``group/dim``, + ``/group/dim``, etc.). Such a dimension name could be a + variable array dimension name, or be referenced by + variable attribute. + + This is only required for reading a Zarr dataset, for + which there is no means of indicating whether the same + dimension names that appear in different groups correspond + to each other, or not. + + For a non-Zarr dataset that adheres to the netCDF data + model, *dimension_search* is ignored because any + correspondence between dimensions is already explicitly + recorded. - This is only required for Zarr input datasets, for which - there is no means of indicating whether the same dimension - name that appears in different groups correspond to each - other, or not. + The *dimension_search* parameter must be one of: - For non-Zarr datasets that adhere to the netCDF data - model, *dimension_mode* is ignored because any - correspondence between dimensions is already explicitly - recorded in these datasets. + * ``'furthest_ancestor'`` - The *dimension_mode* parameter must be one of: + This is the default. Assume that the Zarr sub-group + dimension is the same as the one with the same name and + size in an ancestor group, if one exists. If multiple + such dimensions exist, then the correspondence is with + the dimension in the ancestor group that is furthest + away from the sub-group. - * ``'ancestor'`` + * ``'closet_ancestor'`` - This is the default. Assume that the dimension is the - same as one with the same name and size defined in an - ancestor group, if one exists. If multiple such - dimensions exist, then the correspondence is with the - dimension in the ancestor group that is furthest way - from the root group. + Assume that the Zarr sub-group dimension is the same as + the dimension with the same name and size in an ancestor + group, if one exists. If multiple such dimensions exist, + then the correspondence is with the dimension in the + ancestor group that is closest to the sub-group. - * ``'local'`` + * ``'local'`` - Assume that the dimension is different to any with same - name defined in ancestor groups. + Assume that the Zarr sub-group dimension is different to + any with the same name and size in ancestor groups. .. versionadded:: (cfdm) NEXTVERSION @@ -164,7 +174,7 @@ def netcdf_flatten( strict, omit_data=omit_data, write_chunksize=write_chunksize, - dimension_mode=dimension_mode, + dimension_search=dimension_search, ).flatten() @@ -295,7 +305,7 @@ def __init__( strict=True, omit_data=False, write_chunksize=134217728, - dimension_mode="ancestor", + dimension_search="furthest_ancestor", ): """**Initialisation** @@ -318,7 +328,7 @@ def __init__( write_chunksize: `int`, optional See `netcdf_flatten`. - dimension_mode: `str`, optional + dimension_search: `str`, optional See `netcdf_flatten`. .. versionadded:: (cfdm) NEXTVERSION @@ -413,7 +423,7 @@ def __init__( self._strict = bool(strict) self._omit_data = bool(omit_data) self._write_chunksize = write_chunksize - self._dimension_mode = dimension_mode + self._dimension_search = dimension_search if ( output_ds == input_ds @@ -1905,8 +1915,8 @@ def _group_dimensions(self, group): :Returns: `dict`-like - The dimensions defined in the group, keyed by the - group name. + The dimensions defined in the group, keyed by their + names. 
""" match self._backend(): @@ -2011,7 +2021,7 @@ def _populate_dimension_maps(self, group): group_to_dims = self._group_to_dims var_to_dims = self._var_to_dims - dimension_mode = self._dimension_mode + dimension_search = self._dimension_search # Initialise mapping from the group to its ZarrDimension # objects. Use 'setdefault' because a previous call to @@ -2050,48 +2060,57 @@ def _populate_dimension_maps(self, group): # # E.g. "dim" # ------------------------------------------------ - if dimension_mode == "ancestor": - # Assume that the dimension is the same as one - # with the same name and size defined in an - # ancestor group, if one exists. If multiple - # such dimensions exist, then the - # correspondence is with the dimension in the - # ancestor group that is furthest way from the - # root group. - # - # E.g. if the current group is /g1/g2/g3 then - # search groups /g1/g2, /g1, and / in that - # order, stopping if a match is found. If no - # match is found then we define the dimension - # in the current group. - found_dim_in_ancestor = False + if dimension_search in ( + "furthest_ancestor", + "closest_ancestor", + ): + # Find the names of all ancestor groups, in + # the appropriate order for searching. group_split = group_name.split(group_separator) - for n in range(len(group_split) - 1, 0, -1): - g = group_separator.join(group_split[:n]) - if g == "": - g = group_separator + ancestor_names = [ + group_separator.join(group_split[:n]) + for n in range(1, len(group_split)) + ] + ancestor_names[0] = group_separator + # E.g. if the current group is /g1/g2/g3 then + # the ancestor group names are [/, /g1, + # /g1/g2] + + if dimension_search == "closest_ancestor": + # "closest_ancestor" searching requires + # the ancestor group names to be reversed, + # e.g. [/g1/g2, /g1, /] + ancestor_names = ancestor_names[::-1] + # Search through the ancestors in order, + # stopping if we find a matching dimension. + found_dim_in_ancestor = False + for g in ancestor_names: zarr_dim = group_to_dims[g].get(basename) if zarr_dim is not None and zarr_dim.size == size: - # Found a dimension in this parent - # group with the right name and size + # Found a dimension in this ancestor + # group 'g' with the right name and + # size found_dim_in_ancestor = True break if not found_dim_in_ancestor: # Dimension 'basename' could not be - # matched to any parent group dimensions, - # so define it in the current group. + # matched to any ancestor group + # dimensions, so define it in the current + # group. g = group_name - elif dimension_mode == "local": + elif dimension_search == "local": # Assume that the dimension is different to - # any with same name defined in an ancestor - # group. + # any with same name and size defined in any + # ancestor group. 
g = group_name + else: raise DimensionParsingException( - "Bad value of 'dimension_mode': {dimension_mode!r}" + "Bad value of dimension_search: " + f"{dimension_search!r}" ) else: g = group_separator.join(name_split[:-1]) @@ -2123,7 +2142,7 @@ def _populate_dimension_maps(self, group): raise DimensionParsingException( "In Zarr datasets, can't yet deal with a " "relative path dimension name with upward path " - f"traversals (../) in middle of the name: " + "traversals (../) in middle of the name: " f"dataset={self.dataset_name()} " f"variable={var_name} " f"dimension_name={name}" diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index bd857e969..6730e502f 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -615,7 +615,13 @@ def dataset_open(self, dataset, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - netcdf_flatten(nc, flat_nc, strict=False, omit_data=True) + netcdf_flatten( + nc, + flat_nc, + strict=False, + omit_data=True, + dimension_search=g["group_dimension_search"], + ) # Store the original grouped file. This is primarily # because the unlimited dimensions in the flattened @@ -943,6 +949,7 @@ def read( dataset_type=None, cdl_string=False, ignore_unknown_type=False, + group_dimension_search="furthest_ancestor", ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -1076,6 +1083,12 @@ def read( .. versionadded:: (cfdm) 1.11.2.0 + group_dimension_search: `str`, optional + How to interpret a group dimension name that has no + path. See `cfdm.read` for details. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `list` @@ -1381,6 +1394,7 @@ def read( # Assume a priori that the dataset does not have a group # structure "has_groups": False, + "group_dimension_search": group_dimension_search, # Keep a list of flattened dataset names "flat_datasets": [], # -------------------------------------------------------- @@ -6718,14 +6732,6 @@ def _create_netcdfarray( if return_kwargs_only: return kwargs - # file_opened_with = g["file_opened_with"] - # if file_opened_with == "netCDF4": - # array = self.implementation.initialise_NetCDF4Array(**kwargs) - # elif file_opened_with == "h5netcdf": - # array = self.implementation.initialise_H5netcdfArray(**kwargs) - # elif file_opened_with == "zarr": - # array = self.implementation.initialise_ZarrArray(**kwargs) - match g["original_dataset_opened_with"]: case "netCDF4": array = self.implementation.initialise_NetCDF4Array( @@ -10915,21 +10921,6 @@ def _dataset_has_groups(self, nc): case "zarr": return bool(tuple(nc.group_keys())) - # if self.read_vars["dataset_opened_with"] == "zarr": - # return bool(tuple(nc.groups())) - # # zarr - # #if len(tuple(nc.groups())) > 1: - # #if tuple(nc.groups()): - # # raise ReadError( - # # "Can't read Zarr dataset that has groups: " - # # f"{self.read_vars['dataset']}" - # # ) - # # - # #return False - # - # # netCDF4, h5netcdf - # return bool(nc.groups) - def _file_global_attribute(self, nc, attr): """Return a global attribute from a dataset. @@ -10955,13 +10946,6 @@ def _file_global_attribute(self, nc, attr): case "netCDF4": return nc.getncattr(attr) - # try: - # # netCDF4 - # return nc.getncattr(attr) - # except AttributeError: - # # h5netcdf, zarr - # return nc.attrs[attr] - def _file_global_attributes(self, nc): """Return the global attributes from a dataset. 
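The accessor methods simplified in the following hunks all rely on the same match/case dispatch on whichever library opened the dataset. A standalone sketch of the idiom, with illustrative function and argument names rather than the module's private methods:

    def file_global_attributes(nc, opened_with):
        """Return a dataset's global attributes for any supported backend."""
        match opened_with:
            case "h5netcdf" | "zarr":
                # Both libraries expose a mapping-like `.attrs`
                return dict(nc.attrs)
            case "netCDF4":
                return {attr: nc.getncattr(attr) for attr in nc.ncattrs()}
            case _:
                raise ValueError(f"Unknown backend: {opened_with!r}")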
@@ -10987,13 +10971,6 @@ def _file_global_attributes(self, nc): case "netCDF4": return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} - # try: - # # h5netcdf, zarr - # return nc.attrs - # except AttributeError: - # # netCDF4 - # return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} - def _file_group_variables(self, group): """Return all variables in a group. @@ -11034,9 +11011,6 @@ def _file_dimensions(self, nc): A dictionary of the dimensions keyed by their names. """ - # if hasattr(self, "_cached_file_dimensions"): - # return self._cached_file_dimensions - match self.read_vars["nc_opened_with"]: case "h5netcdf" | "netCDF4": dimensions = dict(nc.dimensions) @@ -11054,29 +11028,8 @@ def _file_dimensions(self, nc): } ) - # self._cached_file_dimensions = dimensions - return dimensions - # try: - # # netCDF4, h5netcdf - # return nc.dimensions - # except AttributeError: - # # zarr - # dimensions = {} - # for var in self._file_variables(nc).values(): - # dimensions.update( - # { - # name: ZarrDimension(name, size, nc) - # for name, size in zip( - # self._file_variable_dimensions(var), var.shape - # ) - # if name not in dimensions - # } - # ) - # - # return dimensions - def _file_dimension(self, nc, dim_name): """Return a dimension from the root group of a dataset. @@ -11127,13 +11080,6 @@ def _file_dimension_isunlimited(self, nc, dim_name): case "zarr": return False - # try: - # # netCDF4, h5netcdf - # return self._file_dimension(nc, dim_name).isunlimited() - # except Exception: - # # zarr - # return False - def _file_dimension_size(self, nc, dim_name): """Return a dimension's size. @@ -11180,13 +11126,6 @@ def _file_variables(self, nc): case "zarr": return dict(nc.arrays()) - # try: - # # netCDF4, h5netcdf - # return nc.variables - # except AttributeError: - # # zarr - # return dict(nc.arrays()) - def _file_variable(self, nc, var_name): """Return a variable. @@ -11277,54 +11216,6 @@ def _file_variable_dimensions(self, var): # Zarr v2 return tuple(var.attrs["_ARRAY_DIMENSIONS"]) - # try: - # # netCDF4, h5netcdf - # return var.dimensions - # except AttributeError: - # try: - # # zarr v3 - # dimension_names = var.metadata.dimension_names - # if dimension_names is None: - # # scalar variable - # dimension_names = () - # - # return dimension_names - # except AttributeError: - # # zarr v2 - # return tuple(var.attrs["_ARRAY_DIMENSIONS"]) - - # def _file_variable_size(self, var): - # """Return the size of a variable's array. - # - # .. versionadded:: (cfdm) 1.11.2.0 - # - # :Parameters: - # - # var: `netCDF4.Variable`, `h5netcdf.Variable`, or `zarr.Array` - # The variable. - # - # :Returns: - # - # `int` - # The array size. - # - # """ - # match self.read_vars["dataset_opened_with"]: - # case 'netCDF4'|'zarr': - # return var.size - # - # case 'h5netcdf': - # return prod(var.shape) - # # Use try/except here because the variable type could differ - # # from that implied by the value of - # # read_vars["dataset_opened_with"] - # try: - # # netCDF4, zarr - # return var.size - # except AttributeError: - # # h5netcdf - # return prod(var.shape) - def _get_storage_options(self, dataset, parsed_dataset): """Get the storage options for accessing a file. @@ -11851,17 +11742,26 @@ def _cache_data_elements(self, data, ncvar, attributes=None): # Also cache the second element for 1-d data, on the # assumption that they may well be dimension coordinate # data. + # + # TODOZARR - do something more clever when all values lie in one chunk. Maybe? if size == 1: indices = (0, -1) value = variable[...] 
- values = (value, value) + values = [value, value] elif size == 2: indices = (0, 1, -1) - value = variable[-1:] - values = (variable[:1], value, value) + values = variable[...].tolist() + values += [values[-1]] + # value = variable[-1:] + # values = (variable[:1], value, value) + elif size == 3: + indices = (0, 1, -1) + values = variable[...].tolist() else: indices = (0, 1, -1) - values = (variable[:1], variable[1:2], variable[-1:]) + values = variable[:2].tolist() + [variable[-1:]] + # v01 = variable[:2] + # values = (v01[0], v01[1], variable[-1:]) elif ndim == 2 and data.shape[-1] == 2: # Assume that 2-d data with a last dimension of size 2 @@ -11870,58 +11770,39 @@ def _cache_data_elements(self, data, ncvar, attributes=None): # last cells. indices = (0, 1, -2, -1) ndim1 = ndim - 1 - values = ( - variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], - ) + v = variable[(slice(0, 1),) * ndim1 + (slice(0, 2),)] + values = v.squeeze().tolist() + # values = ( + # variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], + # variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], + # ) if data.size == 2: values = values + values else: - values += ( - variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], - ) + v = variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 2),)] + values += v.squeeze().tolist() + # variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], + # variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], + # ] elif size == 1: indices = (0, -1) value = variable[...] - values = (value, value) + values = [value, value] elif size == 3: indices = (0, 1, -1) if char: - values = variable[...].reshape(3, variable.shape[-1]) + values = variable[...].reshape(3, variable.shape[-1]).tolist() else: - values = variable[...].flatten() + values = variable[...].flatten().tolist() else: indices = (0, -1) - values = ( + values = [ variable[(slice(0, 1),) * ndim], variable[(slice(-1, None, 1),) * ndim], - ) + ] # Create a dictionary of the element values elements = {index: value for index, value in zip(indices, values)} - # for index, value in zip(indices, values): - # print (repr(value)) - # if obj: - # value = value.astype(str) - # elif string: - # # Convert an array of objects to an array of strings - # value = np.array(value, dtype="U") - # elif char: - # # Variable is a netCDF classic style char array, so - # # collapse (by concatenation) the outermost (fastest - # # varying) dimension. E.g. [['a','b','c']] becomes - # # ['abc'] - # if dtype.kind == "U": - # value = value.astype("S") - # print ('value=', value, value.dtype) - # a = netCDF4.chartostring(value) - # shape = a.shape - # a = np.array([x.rstrip() for x in a.flat]) - # a = np.reshape(a, shape) - # value = np.ma.masked_where(a == "", a) - - # elements[index] = value # Cache the cached data elements for this variable g["cached_data_elements"][ncvar] = elements diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 669fdd6bf..f7a29a143 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -194,6 +194,10 @@ class read(ReadWrite): .. versionadded:: (cfdm) 1.12.0.0 + {{read group_dimension_search: `str`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + ignore_unknown_type: Deprecated at version 1.12.2.0 Use *dataset_type* instead. 
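A hedged usage sketch of the new keyword documented above, assuming this patch is applied and a grouped Zarr store (the path is illustrative); ``'furthest_ancestor'`` is the documented default:

    import cfdm

    # Match a sub-group dimension to the same-named, same-sized dimension
    # in the most distant ancestor group (the default behaviour):
    fields = cfdm.read("obs.zarr", group_dimension_search="furthest_ancestor")

    # Alternatively, treat same-named sub-group dimensions as independent:
    fields = cfdm.read("obs.zarr", group_dimension_search="local")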
@@ -253,6 +257,7 @@ def __new__( followlinks=False, cdl_string=False, extra_read_vars=None, + group_dimension_search="furthest_ancestor", **kwargs, ): """Read field or domain constructs from datasets. @@ -578,6 +583,7 @@ def _read(self, dataset): "dataset_type", "cdl_string", "extra_read_vars", + "group_dimension_search", ) } From 308a99100537a97e6d4822c6a541b877ce554059 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 3 Sep 2025 12:18:49 +0100 Subject: [PATCH 20/39] dev --- Changelog.rst | 8 +- README.md | 8 +- cfdm/__init__.py | 13 -- cfdm/data/h5netcdfarray.py | 11 +- cfdm/data/netcdf4array.py | 13 +- cfdm/data/zarrarray.py | 22 ++- cfdm/read_write/netcdf/flatten/flatten.py | 188 ++++++++++++---------- cfdm/read_write/netcdf/netcdfread.py | 96 ++++++----- cfdm/read_write/netcdf/netcdfwrite.py | 13 +- cfdm/read_write/write.py | 99 ++++++------ requirements.txt | 1 - setup.py | 5 +- 12 files changed, 254 insertions(+), 223 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 2882a952e..a2a8b3839 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,7 +3,11 @@ Version NEXTVERSION **2025-??-??** -* Changed dependency: ``zarr>=3.1.2`` +* Write Zarr v3 datasets with `cfdm.write`, and allow the reading of + grouped Zarr v2 and v3 datasets with `cfdm.read` + (https://github.com/NCAS-CMS/cfdm/issues/???) +* New optional dependency: ``zarr>=3.1.2`` +* Removed dependency (now optional): ``zarr>=3.0.8`` ---- @@ -39,7 +43,7 @@ Version 1.12.2.0 retrieved from disk (https://github.com/NCAS-CMS/cfdm/issues/313) * New keyword parameter to `cfdm.write`: ``chunk_cache`` (https://github.com/NCAS-CMS/cfdm/issues/328) -* Read Zarr datasets with `cfdm.read` +* Read Zarr v2 and v3 datasets with `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/335) * Read multiple datasets simultaneously with `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/336) diff --git a/README.md b/README.md index f6bcfff77..e9d4af6a4 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,12 @@ inspecting it: The ``cfdm`` package can: * read field and domain constructs from netCDF, CDL, and Zarr datasets - with a choice of netCDF backends, and in local, http, and s3 locations, -* be fully flexible with respect to HDF5 chunking, + with a choice of netCDF backends, and in local, http, and s3 + locations, +* be fully flexible with respect to dataset storage chunking, * create new field and domain constructs in memory, -* write and append field and domain constructs to netCDF datasets on disk, +* write and append field and domain constructs to netCDF and Zarr v3 + datasets on disk, * read, write, and manipulate UGRID mesh topologies, * read, write, and create coordinates defined by geometry cells, * read and write netCDF4 string data-type variables, diff --git a/cfdm/__init__.py b/cfdm/__init__.py index ac515cefc..c7f6ca659 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -114,19 +114,6 @@ f"Got {h5py.__version__} at {h5py.__file__}" ) -# Check the version of zarr -try: - import zarr -except ImportError as error1: - raise ImportError(_error0 + str(error1)) -else: - _minimum_vn = "3.1.2" - if Version(zarr.__version__) < Version(_minimum_vn): - raise ValueError( - f"Bad zarr version: cfdm requires zarr>={_minimum_vn}. 
" - f"Got {zarr.__version__} at {zarr.__file__}" - ) - # Check the version of s3fs try: import s3fs diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py index 30a7367b8..ea411630e 100644 --- a/cfdm/data/h5netcdfarray.py +++ b/cfdm/data/h5netcdfarray.py @@ -196,14 +196,15 @@ def get_groups(self, address): return out[:-1], out[-1] def open(self, **kwargs): - """Return a dataset file object and address. - - When multiple files have been provided an attempt is made to - open each one, in the order stored, and a file object is - returned from the first file that exists. + """Return a dataset object and address. .. versionadded:: (cfdm) 1.11.2.0 + :Parameters: + + kwargs: optional + Extra keyword arguments to `h5netcdf.File`. + :Returns: (`h5netcdf.File`, `str`) diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py index 57a796c4d..7b2a4a435 100644 --- a/cfdm/data/netcdf4array.py +++ b/cfdm/data/netcdf4array.py @@ -207,12 +207,13 @@ def get_groups(self, address): out = address.split("/")[1:] return out[:-1], out[-1] - def open(self): - """Return a dataset file object and address. + def open(self, **kwargs): + """Return a dataset object and address. - When multiple files have been provided an attempt is made to - open each one, in the order stored, and a file object is - returned from the first file that exists. + :Parameters: + + kwargs: optional + Extra keyword arguments to `netCDF4.Dataset`. :Returns: @@ -221,4 +222,4 @@ def open(self): address of the data within the file. """ - return super().open(netCDF4.Dataset, mode="r") + return super().open(netCDF4.Dataset, mode="r", **kwargs) diff --git a/cfdm/data/zarrarray.py b/cfdm/data/zarrarray.py index 194228903..67f7dbd81 100644 --- a/cfdm/data/zarrarray.py +++ b/cfdm/data/zarrarray.py @@ -97,11 +97,16 @@ def close(self, dataset): # `zarr.Group` objects don't need closing pass - def open(self): - """Return a dataset file object and address. + def open(self, **kwargs): + """Return a dataset object and address. .. versionadded:: (cfdm) 1.12.2.0 + :Parameters: + + kwargs: optional + Extra keyword arguments to `zarr.open`. + :Returns: (`zarr.Group`, `str`) @@ -109,6 +114,13 @@ def open(self): variable name of the data within the dataset. """ - import zarr - - return super().open(zarr.open, mode="r") + try: + import zarr + except ModuleNotFoundError as error: + error.msg += ( + ". Install the 'zarr' package " + "(https://pypi.org/project/zarr) to read Zarr datasets" + ) + raise + + return super().open(zarr.open, mode="r", **kwargs) diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 3eb26deac..159d5e2a2 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -18,7 +18,8 @@ import re import warnings -from ....functions import is_log_level_debug +from cfdm.functions import is_log_level_debug + from .config import ( flattener_attribute_map, flattener_dimension_map, @@ -50,8 +51,7 @@ def netcdf_flatten( input_ds, output_ds, strict=True, - omit_data=False, - write_chunksize=134217728, + copy_data=True, dimension_search="furthest_ancestor", ): """Create a flattened version of a grouped CF dataset. @@ -100,24 +100,12 @@ def netcdf_flatten( raises an exception. If False, a warning is issued and flattening is continued. - omit_data: `bool`, optional - If True then do not copy the data of any variables from - *input_ds* to *output_ds*. 
This does not affect the amount - of netCDF variables and dimensions that are written to the - file, nor the netCDF variables' attributes, but for all - variables it does not create data on disk or in - memory. The resulting dataset will be smaller than it - otherwise would have been, and when the new dataset is - accessed the data of these variables will be represented - by an array of all missing data. If False, the default, - then all data arrays are copied. - - write_chunksize: `int`, optional - When *omit_data* is False, the copying of data is done - piecewise to keep memory usage down. *write_chunksize* is - the size in bytes of how much data is copied from - *input_ds* to *output_ds* for each piece. Ignored if - *omit_data* is True. + copy_data: `bool`, optional + By default, *copy_data* is True and all data arrays from + *input_ds* are copied to *output_ds*. If False then no + data arrays are copied, instead all variables' data will + be represented by the fill value, but without having to + actually create these arrays in memory or on disk. dimension_search: `str`, optional How to interpret a sub-group dimension name that has no @@ -172,8 +160,7 @@ def netcdf_flatten( input_ds, output_ds, strict, - omit_data=omit_data, - write_chunksize=write_chunksize, + copy_data=copy_data, dimension_search=dimension_search, ).flatten() @@ -303,8 +290,7 @@ def __init__( input_ds, output_ds, strict=True, - omit_data=False, - write_chunksize=134217728, + copy_data=True, dimension_search="furthest_ancestor", ): """**Initialisation** @@ -322,10 +308,7 @@ def __init__( strict: `bool`, optional See `netcdf_flatten`. - omit_data: `bool`, optional - See `netcdf_flatten`. - - write_chunksize: `int`, optional + copy_data: `bool`, optional See `netcdf_flatten`. dimension_search: `str`, optional @@ -387,7 +370,7 @@ def __init__( # '/forecast/model': {}} # # Cuurently this mapping is only required for an input - # `zarr.Group` dataset, and is populated by + # `zarr.Group` dataset, and is generated by # `_populate_dimension_maps`. self._group_to_dims = {} @@ -400,7 +383,7 @@ def __init__( # 'forecast/y': (=1) netCDF string type variable + # comes out as a numpy object array, so + # convert it to numpy string array. + array = array.astype("U", copy=False) + # netCDF4 doesn't auto-mask VLEN variables + # array = np.ma.where(array == "", + # np.ma.masked, array) + array = np.ma.masked_values(array, "") + + old_var = array + + case "zarr": + array = old_var[...] + array = array.astype("O", copy=False).astype( + "U", copy=False + ) + fill_value = old_var.attrs.get( + "_FillValue", old_var.attrs.get("missing_value", "") + ) + array = np.where(array == "", fill_value, array) + old_var = array - # Get next position - var_end_reached = not self.increment_pos( - pos, 0, chunk_shape, shape + if isinstance(old_var, np.ndarray): + new_var[...] = old_var + else: + dx = da.from_array(old_var) + da.store( + dx, + new_var, + compute=True, + return_stored=False, + lock=netcdf_lock, ) def resolve_reference(self, orig_ref, orig_var, rules): @@ -1994,10 +2006,10 @@ def _populate_dimension_maps(self, group): **Zarr datasets** - Populating the `_group_to_dims` dictionary is only required - for a Zarr grouped dataset, for which this information is not - explicitly defined in the format's data model (unlike for - netCDF-3 and netCDF-4 datasets). 
+ Populating the `_group_to_dims` dictionary is currently only + required for a Zarr grouped dataset, for which this + information is not explicitly defined in the format's data + model (unlike for netCDF and HDF5 datasets). See `netcdf_flatten` for details @@ -2020,7 +2032,6 @@ def _populate_dimension_maps(self, group): input_ds = self._input_ds group_to_dims = self._group_to_dims var_to_dims = self._var_to_dims - dimension_search = self._dimension_search # Initialise mapping from the group to its ZarrDimension @@ -2078,7 +2089,7 @@ def _populate_dimension_maps(self, group): if dimension_search == "closest_ancestor": # "closest_ancestor" searching requires - # the ancestor group names to be reversed, + # the ancestor group order to be reversed, # e.g. [/g1/g2, /g1, /] ancestor_names = ancestor_names[::-1] @@ -2109,7 +2120,7 @@ def _populate_dimension_maps(self, group): else: raise DimensionParsingException( - "Bad value of dimension_search: " + "Bad 'dimension_search' value: " f"{dimension_search!r}" ) else: @@ -2224,8 +2235,13 @@ def _populate_dimension_maps(self, group): group_to_dims[g] = {} if zarr_dim is None: - # Register a new ZarrDimension in group 'g' - zarr_dim = ZarrDimension(basename, size, input_ds[g], v) + # Register a new ZarrDimension in a group + defining_group = input_ds.get(g) + if defining_group is None: + # Must be the root group + defining_group = input_ds + + zarr_dim = ZarrDimension(basename, size, defining_group, v) group_to_dims[g][basename] = zarr_dim # Map the variable to the ZarrDimension object diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 6730e502f..c46c44dbe 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -619,7 +619,7 @@ def dataset_open(self, dataset, flatten=True, verbose=None): nc, flat_nc, strict=False, - omit_data=True, + copy_data=False, dimension_search=g["group_dimension_search"], ) @@ -711,7 +711,10 @@ def _open_zarr(self, dataset): try: import zarr except ModuleNotFoundError as error: - error.msg += ". Install the 'zarr' package to read Zarr datasets" + error.msg += ( + ". Install the 'zarr' package " + "(https://pypi.org/project/zarr) to read Zarr datasets" + ) raise nc = zarr.open(dataset, mode="r") @@ -11360,7 +11363,7 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): # No Dask chunking return -1 - storage_chunks = self._dataset_chunksizes(g["variables"][ncvar]) + storage_chunks = self._variable_chunksizes(g["variables"][ncvar]) ndim = array.ndim if ( @@ -11705,30 +11708,23 @@ def _cache_data_elements(self, data, ncvar, attributes=None): return # ------------------------------------------------------------ - # Still here? then there were no cached data elements, so we - # have to create them. + # Still here? Then there were no cached data elements, so we + # have to create them. # ------------------------------------------------------------ + + # Include optimisations for the common case that the entire + # array is stored in one dataset chunk (which does *not* + # include netCDF contiguous arrays), that prevent the reading + # of that chunk multiple times. 
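+        # Illustration (not part of this change): for a 1-d variable
+        # held in a single dataset chunk, the first, second and last
+        # elements can be fetched with one read, e.g.
+        # variable[[0, 1, -1]], rather than with separate reads of
+        # variable[:2] and variable[-1:] that each re-read the same
+        # chunk.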
+ one_chunk = self._variable_chunksizes(variable) == variable.shape + # Get the required element values size = data.size ndim = data.ndim - # Whether or not this is an array of strings - dtype = variable.dtype - string = dtype == str - obj = not string and dtype.kind == "O" - - # Whether or not this is an array of chars - if ( - not (string or obj) - and dtype.kind in "SU" - and variable.ndim == ndim + 1 - ): - # This variable is a netCDF classic style char array with - # a trailing dimension that needs to be collapsed - char = True - else: - char = False - + # Get the values using `netcdf_indexer`, as this conveniently + # deals with different type of indexing, string and character + # arrays, etc. variable = netcdf_indexer( variable, mask=True, @@ -11742,8 +11738,6 @@ def _cache_data_elements(self, data, ncvar, attributes=None): # Also cache the second element for 1-d data, on the # assumption that they may well be dimension coordinate # data. - # - # TODOZARR - do something more clever when all values lie in one chunk. Maybe? if size == 1: indices = (0, -1) value = variable[...] @@ -11752,16 +11746,15 @@ def _cache_data_elements(self, data, ncvar, attributes=None): indices = (0, 1, -1) values = variable[...].tolist() values += [values[-1]] - # value = variable[-1:] - # values = (variable[:1], value, value) elif size == 3: indices = (0, 1, -1) values = variable[...].tolist() else: indices = (0, 1, -1) - values = variable[:2].tolist() + [variable[-1:]] - # v01 = variable[:2] - # values = (v01[0], v01[1], variable[-1:]) + if one_chunk: + values = variable[list(indices)].tolist() + else: + values = variable[:2].tolist() + [variable[-1:]] elif ndim == 2 and data.shape[-1] == 2: # Assume that 2-d data with a last dimension of size 2 @@ -11770,36 +11763,39 @@ def _cache_data_elements(self, data, ncvar, attributes=None): # last cells. indices = (0, 1, -2, -1) ndim1 = ndim - 1 - v = variable[(slice(0, 1),) * ndim1 + (slice(0, 2),)] - values = v.squeeze().tolist() - # values = ( - # variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], - # variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], - # ) + if one_chunk: + v = variable[...] + else: + v = variable + + index = (slice(0, 1),) * ndim1 + (slice(0, 2),) + values = v[index].squeeze().tolist() if data.size == 2: values = values + values else: - v = variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 2),)] - values += v.squeeze().tolist() - # variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], - # variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], - # ] + index = (slice(-1, None, 1),) * ndim1 + (slice(0, 2),) + values += v[index].squeeze().tolist() + + del v + elif size == 1: indices = (0, -1) value = variable[...] values = [value, value] elif size == 3: indices = (0, 1, -1) - if char: - values = variable[...].reshape(3, variable.shape[-1]).tolist() - else: - values = variable[...].flatten().tolist() + values = variable[...].flatten().tolist() else: indices = (0, -1) - values = [ - variable[(slice(0, 1),) * ndim], - variable[(slice(-1, None, 1),) * ndim], - ] + if one_chunk: + v = variable[...] 
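+                # 'v' now holds the whole array, read via its single
+                # dataset chunk, so both cached end values come from
+                # one read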
+ values = [v.item(0), v.item(-1)] + del v + else: + values = [ + variable[(slice(0, 1),) * ndim], + variable[(slice(-1, None, 1),) * ndim], + ] # Create a dictionary of the element values elements = {index: value for index, value in zip(indices, values)} @@ -11810,8 +11806,8 @@ def _cache_data_elements(self, data, ncvar, attributes=None): # Store the elements in the data object data._set_cached_elements(elements) - def _dataset_chunksizes(self, variable): - """Return the variable chunk sizes. + def _variable_chunksizes(self, variable): + """Return the dataset variable chunk sizes. .. versionadded:: (cfdm) 1.11.2.0 diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index ca85c7990..631c6ccfb 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2631,6 +2631,7 @@ def _createVariable(self, **kwargs): shards = kwargs.get("shards") if chunks is None: + # One chunk for the entire array chunks = shape if shards is not None: @@ -2937,6 +2938,7 @@ def _write_netcdf_variable( logger.debug( f" chunksizes: {chunksizes}\n" f" contiguous: {contiguous}" + f" shards : {shards}" ) # pragma: no cover # ------------------------------------------------------------ @@ -3441,7 +3443,8 @@ def _write_data( if zarr: # `zarr` can't write a masked array to a variable, so we - # have to replace missing data with the fill value. + # have to manually replace missing data with the fill + # value. dx = dx.map_blocks( self._filled_array, meta=np.array((), dx.dtype), @@ -3451,7 +3454,7 @@ def _write_data( if lock is None: # We need to define the dataset lock for data writing from # Dask - from ...data.locks import netcdf_lock as lock + from cfdm.data.locks import netcdf_lock as lock da.store( dx, g["nc"][ncvar], compute=True, return_stored=False, lock=lock @@ -5023,7 +5026,8 @@ def dataset_open(self, dataset_name, mode, fmt, fields): import zarr except ModuleNotFoundError as error: error.msg += ( - ". Install the 'zarr' package to write Zarr datasets" + ". Install the 'zarr' package " + "(https://pypi.org/project/zarr) to read Zarr datasets" ) raise @@ -5913,6 +5917,9 @@ def _file_io_iteration( fletcher32=fletcher32, shuffle=shuffle, extra_write_vars=extra_write_vars, + chunk_cache=chunk_cache, + dataset_chunks=g["dataset_chunks"], + dataset_shards=g["dataset_shards"], ) def _int32(self, array): diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index f6f354908..e67105ae6 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -6,21 +6,21 @@ class write(ReadWrite): - """Write field and domain constructs to a netCDF file. + """Write field and domain constructs to a netCDF or Zarr dataset. - **File format** + **Dataset format** - See the *fmt* parameter for details on which output netCDF file + See the *fmt* parameter for details on which output dataset formats are supported. - **NetCDF variable and dimension names** + **Dataset variable and dimension names** - These names are stored within constructs read a from dataset, or - may be set manually. They are used when writing a field construct - to the file. If a name has not been set then one will be - constructed (usually based on the standard name if it exists). The - names may be modified internally to prevent duplication in the - file. + These names are stored within constructs and are either read a + from another dataset or may be set manually. They are used when + writing a field construct to the dataset. 
If a name has not been + set then one will be constructed (usually based on the standard + name if it exists). The names may be modified internally to + prevent duplication in the dataset. Each construct, or construct component, that corresponds to a netCDF variable has the following methods to get, set and remove a @@ -33,19 +33,19 @@ class write(ReadWrite): `~{{package}}.DomainAxis.nc_set_dimension` and `~{{package}}.DomainAxis.nc_del_dimension`. - **NetCDF attributes** + **Dataset attributes** - Field construct properties may be written as netCDF global - attributes and/or netCDF data variable attributes. See the - *file_descriptors*, *global_attributes* and *variable_attributes* - parameters for details. + Field construct properties may be written as global attributes + (i.e. attributes of the root group) and/or data variable + attributes. See the *file_descriptors*, *global_attributes* and + *variable_attributes* parameters for details. **External variables** - Metadata constructs marked as external are omitted from the file - and referred to via the netCDF ``external_variables`` global - attribute. However, omitted constructs may be written to an - external file (see the *external* parameter for details). + Metadata constructs marked as external are omitted from the + dataset and referred to via the CF ``external_variables`` global + attribute. However, the omitted constructs may be written to an + external dataset (see the *external* parameter for details). **NetCDF unlimited dimensions** @@ -95,7 +95,7 @@ class write(ReadWrite): ``'~/file.nc'``, ``'~/tmp/../file.nc'``. fmt: `str`, optional - The format of the output file. One of: + The format of the output dataset. One of: ========================== ============================== *fmt* Output dataset type @@ -120,13 +120,13 @@ class write(ReadWrite): file with extensions (see below) - ``'ZARR3'`` Zarr v3 + ``'ZARR3'`` Zarr v3 dataset ========================== ============================== By default the format is ``'NETCDF4'``. All NETCDF formats support large files (i.e. those greater - than 2GB) except ``'NETCDF3_CLASSIC'``. + than 2GB), except ``'NETCDF3_CLASSIC'``. ``'NETCDF3_64BIT_DATA'`` is a format that requires version 4.4.0 or newer of the C library (use @@ -152,14 +152,14 @@ class write(ReadWrite): ======== ================================================= *mode* Description ======== ================================================= - ``'w'`` Open a new file for writing to. If it exists and - *overwrite* is True then the file is deleted - prior to being recreated. + ``'w'`` Open a new dataset for writing to. If it exists + and *overwrite* is True then the dataset is + deleted prior to being recreated. - ``'a'`` Open an existing file for appending new + ``'a'`` Open an existing dataset for appending new information to. The new information will be incorporated whilst the original contents of the - file will be preserved. + dataset will be preserved. In practice this means that new fields will be created, whilst the original fields will not be @@ -168,7 +168,7 @@ class write(ReadWrite): For append mode, note the following: - * Global attributes on the file + * Global attributes on the dataset will remain the same as they were originally, so will become inaccurate where appended fields have incompatible attributes. To rectify this, @@ -178,7 +178,7 @@ class write(ReadWrite): `nc_set_global_attribute`. * Fields with incompatible ``featureType`` to - the original file cannot be appended. 
+ the original dataset cannot be appended. * At present fields with groups cannot be appended, but this will be possible in a future @@ -340,7 +340,7 @@ class write(ReadWrite): numpy.dtype('int32')}``. endian: `str`, optional - The endian-ness of the output file. Valid values are + The endian-ness of the output dataset. Valid values are ``'little'``, ``'big'`` or ``'native'``. By default the output is native endian. See the `netCDF4 package `_ for more @@ -553,21 +553,24 @@ class write(ReadWrite): .. note:: By default, a data array returned by `{{package}}.read` stores its dataset chunking - strategy from the file being read. When this + strategy from the dataset being read. When this happens that same dataset chunking strategy will be used when the data is written to a new - netCDF4 file, unless the strategy was modified - or removed prior to writing. To prevent the - dataset chunking strategy from the original file - being stored, see the *store_dataset_chunks* - parameter to `{{package}}.read`. + netCDF4 or Zarr dataset, unless the strategy was + modified or removed prior to writing. To prevent + the dataset chunking strategy from the original + dataset being stored, see the + *store_dataset_chunks* parameter to + `{{package}}.read`. The *dataset_chunks* parameter may be one of: * ``'contiguous'`` - The data will written to the file contiguously, i.e. no - chunking. + The data will written to the dataset contiguously, + i.e. no chunking. For a Zarr dataset, this is + implemented as a single dataset chunk for the entire + array. * `int` or `float` or `str` @@ -750,18 +753,18 @@ class write(ReadWrite): * ``'uri'``: `str` - Specify the URI format of the fragment file names. + Specify the URI format of the fragment dataset names. - If ``'default'`` (the default) then the fragment file + If ``'default'`` (the default) then the fragment dataset names will be written with the same URI formats that - they had when read from input files (for file names - originating from the reading of normal non-aggregation - variables, this will result in absolute URIs). If - ``'absolute'`` then all fragment file names will be - written as absolute URIs. If ``'relative'`` then all - fragment file names will be written as relative-path URI - references URIs, relative to the location of the - aggregation file. + they had when read from input datasets (for dataset + names originating from the reading of normal + non-aggregation variables, this will result in absolute + URIs). If ``'absolute'`` then all fragment dataset names + will be written as absolute URIs. If ``'relative'`` then + all fragment dataset names will be written as + relative-path URI references URIs, relative to the + location of the aggregation dataset. 
* ``'strict'``: `bool` diff --git a/requirements.txt b/requirements.txt index 768a5bac3..f41a3712e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,3 @@ dask>=2025.5.1 distributed>=2025.5.1 uritools>=4.0.3 cfunits>=3.3.7 -zarr>=3.1.2 diff --git a/setup.py b/setup.py index a800a0510..10ea52930 100755 --- a/setup.py +++ b/setup.py @@ -58,7 +58,7 @@ def _get_version(): * read field and domain constructs from netCDF, CDL, and Zarr datasets with a choice of netCDF backends, * be fully flexible with respect to dataset storage chunking, * create new field and domain constructs in memory, -* write and append field and domain constructs to netCDF datasets on disk, +* write and append field and domain constructs to netCDF and Zarr v3 datasets on disk, * read, write, and manipulate UGRID mesh topologies, * read, write, and create coordinates defined by geometry cells, * read and write netCDF4 string data-type variables, @@ -136,6 +136,9 @@ def _get_version(): "flake8", "pydocstyle", ], + "zarr": [ + "zarr>=3.1.2", + ], } setup( From ccf933bacb03eed9bfeaaf22af0ba9f03107cc4f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 3 Sep 2025 12:32:10 +0100 Subject: [PATCH 21/39] dev --- Changelog.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Changelog.rst b/Changelog.rst index a2a8b3839..0a0603f61 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -5,7 +5,9 @@ Version NEXTVERSION * Write Zarr v3 datasets with `cfdm.write`, and allow the reading of grouped Zarr v2 and v3 datasets with `cfdm.read` - (https://github.com/NCAS-CMS/cfdm/issues/???) + (https://github.com/NCAS-CMS/cfdm/issues/354) +* Read Zarr v2 and v3 datasets that contain a group hierarchy with + `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/355) * New optional dependency: ``zarr>=3.1.2`` * Removed dependency (now optional): ``zarr>=3.0.8`` From 548d039d2fd28dac02c15c6cbfd7aa8b1e6a2fe6 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 15 Sep 2025 19:15:03 +0100 Subject: [PATCH 22/39] dev --- cfdm/data/aggregatedarray.py | 3 +- cfdm/data/netcdfindexer.py | 5 +++ cfdm/docstring/docstring.py | 44 +++++++++--------- cfdm/mixin/netcdf.py | 4 +- cfdm/read_write/netcdf/flatten/flatten.py | 55 +++++++++++++---------- cfdm/read_write/netcdf/netcdfread.py | 34 +++++++------- cfdm/read_write/netcdf/netcdfwrite.py | 41 ++++++++++------- cfdm/read_write/netcdf/zarr.py | 8 ++-- docs/source/installation.rst | 20 ++++++++- docs/source/introduction.rst | 3 +- docs/source/tutorial.rst | 6 +-- 11 files changed, 135 insertions(+), 88 deletions(-) diff --git a/cfdm/data/aggregatedarray.py b/cfdm/data/aggregatedarray.py index 6a5d62516..41f0c0641 100644 --- a/cfdm/data/aggregatedarray.py +++ b/cfdm/data/aggregatedarray.py @@ -265,9 +265,10 @@ def _parse_fragment_array(self, aggregated_filename, fragment_array): uri = fa_uris[index] try: + # 'uri' is a size 1 numpy array uri = uri.item() except AttributeError: - # E.g. if 'uri' is a `str` instance + # E.g. 
'uri' is a `str` instance pass parsed_fragment_array[index] = { diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 846e2bfdd..e50fbfd68 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -397,6 +397,8 @@ def _default_FillValue(self, dtype): return default_fillvals["S1"] if kind == "T": + # np.dtypes.StringDType, which stores variable-width + # string data in a UTF-8 encoding, as used by `zarr`) return "" return default_fillvals[dtype.str[1:]] @@ -622,6 +624,9 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): if fvalisnan: mask = np.isnan(data) else: + # Must use `np.asanyarray` here, to ensure that that + # 'mask' is a never a `bool`, which would make the + # following 'mask.any' call' fail. mask = np.asanyarray(data == fval) if mask.any(): diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 9f8348553..f3adb4998 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -550,13 +550,12 @@ written to a new netCDF file with `{{package}}.write` (unless the strategy is modified prior to writing). - If False, or if the dataset being read does not support - chunking (such as a netCDF-3 dataset), then no dataset - chunking strategy is stored (i.e. an - `nc_dataset_chunksizes` method will return `None` for all - returned `Data` objects). In this case, when the data is - written to a new netCDF file, the dataset chunking - strategy will be determined by `{{package}}.write`. + If False then no dataset chunking strategy is stored + (i.e. the `nc_dataset_chunksizes` method will return + `None` for all returned `Data` objects). In this case, + when the data is written to a new dataset, the dataset + chunking strategy will be determined by + `{{package}}.write`. See the `{{package}}.write` *dataset_chunks* parameter for details on how the dataset chunking strategy is determined @@ -572,11 +571,11 @@ (unless the strategy is modified prior to writing). If False, or if the dataset being read does not support - sharding (such as a netCDF-4 dataset), then no dataset - sharding strategy is stored (i.e. an `nc_dataset_shards` + sharding (such as a netCDF dataset), then no dataset + sharding strategy is stored (i.e. the `nc_dataset_shards` method will return `None` for all returned `Data` objects). In this case, when the data is written to a new - Zarr dataset, the dataset shardinging strategy will be + Zarr dataset, the dataset sharding strategy will be determined by `{{package}}.write`.""", # read cfa "{{read cfa: `dict`, optional}}": """cfa: `dict`, optional @@ -730,9 +729,9 @@ How to interpret a sub-group dimension name that has no path, i.e. that contains no group-separator characters, such as ``dim`` (as opposed to ``group/dim``, - ``/group/dim``, etc.). Such a dimension name could be a - variable array dimension name, or be referenced by - variable attribute. + ``/group/dim``, ``../dim``, etc.). Such a dimension name + could be a variable array dimension name, or be referenced + by variable attribute. This is only required for reading a Zarr dataset, for which there is no means of indicating whether the same @@ -740,9 +739,10 @@ to each other, or not. For a non-Zarr dataset that adheres to the netCDF data - model, *group_dimension_search* is ignored because any + model (such as a netCDF-4 dataset), + *group_dimension_search* **is ignored** because any correspondence between dimensions is already explicitly - recorded. + defined. 
The *group_dimension_search* parameter must be one of: @@ -752,8 +752,9 @@ dimension is the same as the one with the same name and size in an ancestor group, if one exists. If multiple such dimensions exist, then the correspondence is with - the dimension in the ancestor group that is furthest - away from the sub-group. + the dimension in the ancestor group that is **furthest + away** from the sub-group (i.e. that is closest to the + root group). * ``'closet_ancestor'`` @@ -761,12 +762,13 @@ the dimension with the same name and size in an ancestor group, if one exists. If multiple such dimensions exist, then the correspondence is with the dimension in the - ancestor group that is closest to the sub-group. + ancestor group that is **closest to** the sub-group + (i.e. that is furthest away from the root group). * ``'local'`` Assume that the Zarr sub-group dimension is different to - any with the same name and size in ancestor groups.""", + any with the same name and size in all ancestor groups.""", # persist "{{persist description}}": """Persisting turns an underlying lazy dask array into an equivalent chunked dask array, but now with the results fully @@ -1387,7 +1389,7 @@ of the file system. The sharding strategy is ignored when writing to a non-Zarr - datset.""", + dataset.""", # ---------------------------------------------------------------- # Method description substitutions (4 levels of indentation) # ---------------------------------------------------------------- @@ -1424,7 +1426,7 @@ The integer number of chunks to be stored in a single shard, favouring an equal number of chunks - along each shard dimenson. + along each shard dimension. * sequence of `int` diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index 11b53810f..bb427e1cd 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -32,7 +32,7 @@ def __initialise_from_source(self, source, copy=True): :Parameters: source: - N The object from which to extract the initialisation + The object from which to extract the initialisation information. Typically, but not necessarily, a `{{class}}` object. @@ -5143,7 +5143,7 @@ class NetCDFShards(NetCDFMixin): they are smaller than the block size of the file system. The sharding strategy is ignored when writing to a non-Zarr - datset. + dataset. .. versionadded:: (cfdm) NEXTVERSION diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 159d5e2a2..5444aa727 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -52,7 +52,7 @@ def netcdf_flatten( output_ds, strict=True, copy_data=True, - dimension_search="furthest_ancestor", + group_dimension_search="furthest_ancestor", ): """Create a flattened version of a grouped CF dataset. @@ -107,13 +107,13 @@ def netcdf_flatten( be represented by the fill value, but without having to actually create these arrays in memory or on disk. - dimension_search: `str`, optional + group_dimension_search: `str`, optional How to interpret a sub-group dimension name that has no path, i.e. that contains no group-separator characters, such as ``dim`` (as opposed to ``group/dim``, - ``/group/dim``, etc.). Such a dimension name could be a - variable array dimension name, or be referenced by - variable attribute. + ``/group/dim``, ``../dim``, etc.). Such a dimension name + could be a variable array dimension name, or be referenced + by variable attribute. 
This is only required for reading a Zarr dataset, for which there is no means of indicating whether the same @@ -121,11 +121,12 @@ def netcdf_flatten( to each other, or not. For a non-Zarr dataset that adheres to the netCDF data - model, *dimension_search* is ignored because any + model (such as a netCDF-4 dataset), + *group_dimension_search* **is ignored** because any correspondence between dimensions is already explicitly - recorded. + defined. - The *dimension_search* parameter must be one of: + The *group_dimension_search* parameter must be one of: * ``'furthest_ancestor'`` @@ -133,8 +134,9 @@ def netcdf_flatten( dimension is the same as the one with the same name and size in an ancestor group, if one exists. If multiple such dimensions exist, then the correspondence is with - the dimension in the ancestor group that is furthest - away from the sub-group. + the dimension in the ancestor group that is **furthest + away** from the sub-group (i.e. that is closest to the + root group). * ``'closet_ancestor'`` @@ -142,12 +144,13 @@ def netcdf_flatten( the dimension with the same name and size in an ancestor group, if one exists. If multiple such dimensions exist, then the correspondence is with the dimension in the - ancestor group that is closest to the sub-group. + ancestor group that is **closest to** the sub-group + (i.e. that is furthest away from the root group). * ``'local'`` Assume that the Zarr sub-group dimension is different to - any with the same name and size in ancestor groups. + any with the same name and size in all ancestor groups. .. versionadded:: (cfdm) NEXTVERSION @@ -161,7 +164,7 @@ def netcdf_flatten( output_ds, strict, copy_data=copy_data, - dimension_search=dimension_search, + group_dimension_search=group_dimension_search, ).flatten() @@ -291,7 +294,7 @@ def __init__( output_ds, strict=True, copy_data=True, - dimension_search="furthest_ancestor", + group_dimension_search="furthest_ancestor", ): """**Initialisation** @@ -311,7 +314,7 @@ def __init__( copy_data: `bool`, optional See `netcdf_flatten`. - dimension_search: `str`, optional + group_dimension_search: `str`, optional See `netcdf_flatten`. .. versionadded:: (cfdm) NEXTVERSION @@ -405,7 +408,7 @@ def __init__( self._strict = bool(strict) self._copy_data = bool(copy_data) - self._dimension_search = dimension_search + self._group_dimension_search = group_dimension_search if ( output_ds == input_ds @@ -2032,7 +2035,7 @@ def _populate_dimension_maps(self, group): input_ds = self._input_ds group_to_dims = self._group_to_dims var_to_dims = self._var_to_dims - dimension_search = self._dimension_search + group_dimension_search = self._group_dimension_search # Initialise mapping from the group to its ZarrDimension # objects. Use 'setdefault' because a previous call to @@ -2071,7 +2074,7 @@ def _populate_dimension_maps(self, group): # # E.g. "dim" # ------------------------------------------------ - if dimension_search in ( + if group_dimension_search in ( "furthest_ancestor", "closest_ancestor", ): @@ -2087,7 +2090,7 @@ def _populate_dimension_maps(self, group): # the ancestor group names are [/, /g1, # /g1/g2] - if dimension_search == "closest_ancestor": + if group_dimension_search == "closest_ancestor": # "closest_ancestor" searching requires # the ancestor group order to be reversed, # e.g. [/g1/g2, /g1, /] @@ -2112,7 +2115,7 @@ def _populate_dimension_maps(self, group): # group. 
g = group_name - elif dimension_search == "local": + elif group_dimension_search == "local": # Assume that the dimension is different to # any with same name and size defined in any # ancestor group. @@ -2120,8 +2123,8 @@ def _populate_dimension_maps(self, group): else: raise DimensionParsingException( - "Bad 'dimension_search' value: " - f"{dimension_search!r}" + "Bad 'group_dimension_search' value: " + f"{group_dimension_search!r}" ) else: g = group_separator.join(name_split[:-1]) @@ -2144,11 +2147,13 @@ def _populate_dimension_maps(self, group): # -------------------------------------------- # Relative path dimension name with upward # path traversals ('../') not at the start of - # the name + # the name. # # E.g. "/group1/../group2/dim" # E.g. "group1/../group2/dim" # E.g. "../group1/../group2/dim" + # + # Note that "../../dim" is not such a case. # -------------------------------------------- raise DimensionParsingException( "In Zarr datasets, can't yet deal with a " @@ -2157,7 +2162,8 @@ def _populate_dimension_maps(self, group): f"dataset={self.dataset_name()} " f"variable={var_name} " f"dimension_name={name}" - "\n\nPlease raise an issue at " + "\n\n" + "Please raise an issue at " "https://github.com/NCAS-CMS/cfdm/issues " "if you would like this feature." ) @@ -2170,6 +2176,7 @@ def _populate_dimension_maps(self, group): # # E.g. "../group1/dim" # E.g. "../../group1/dim" + # E.g. "../../dim" # -------------------------------------------- current_group = group while g.startswith(f"..{group_separator}"): diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index c46c44dbe..ba128a5e8 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -620,7 +620,7 @@ def dataset_open(self, dataset, flatten=True, verbose=None): flat_nc, strict=False, copy_data=False, - dimension_search=g["group_dimension_search"], + group_dimension_search=g["group_dimension_search"], ) # Store the original grouped file. This is primarily @@ -1039,18 +1039,18 @@ def read( cfa: `dict`, optional Configure the reading of CF-netCDF aggregation - datasets. See `cfdm.read` for details. + datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 cfa_write: sequence of `str`, optional Configure the reading of CF-netCDF aggregation - datasets. See `cfdm.read` for details. + datasets. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 to_memory: (sequence) of `str`, optional - Whether or not to bring data arrays into memory. See + Whether or not to bring data arrays into memory. See `cfdm.read` for details. .. versionadded:: (cfdm) 1.12.0.0 @@ -6687,7 +6687,6 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape - # size = self._file_variable_size(variable) size = prod(shape) if size < 2: @@ -9605,22 +9604,19 @@ def _netCDF4_group(self, nc, name): :Parameters: - nc: `netCDF4._netCDF4.Dataset` or `netCDF4._netCDF4.Group` + nc: `netCDF4.Dataset` or `h5netcdf.Group` or `zarr.Group` name: `str` :Returns: - (`netCDF4._netCDF4.Dataset` or `netCDF4._netCDF4.Group`, `str`) + 2-`tuple`: + The group object, and the relative-path variable name. **Examples** - >>> group, name = n._netCDF4_group(nc, 'time') - >>> group.name, name - ('/', 'time') - >>> group, name = n._netCDF4_group(nc, '/surfacelayer/Z') - >>> group.name, name - ('surfacelayer', 'Z') + >>> n._netCDF4_group(nc, '/forecast/count') + (. 
'count') """ group = nc @@ -11650,7 +11646,7 @@ def _dask_chunks(self, array, ncvar, compressed, construct_type=None): # ------------------------------------------------------------ return dask_chunks - def _cache_data_elements(self, data, ncvar, attributes=None): + def _cache_data_elements(self, data, ncvar, attributes): """Cache selected element values. Updates *data* in-place to store its first, second, @@ -11678,6 +11674,11 @@ def _cache_data_elements(self, data, ncvar, attributes=None): The name of the netCDF variable that contains the data. + attributes: `dict` + The attributes of the netCDF variable. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `None` @@ -11695,7 +11696,6 @@ def _cache_data_elements(self, data, ncvar, attributes=None): group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - # variable = group.variables.get(name) variable = self._file_group_variables(group).get(name) else: variable = g["variables"].get(ncvar) @@ -11734,6 +11734,10 @@ def _cache_data_elements(self, data, ncvar, attributes=None): attributes=attributes, copy=False, ) + + # Get the cached values, minimising the number of "gets" on + # the dataset by not accessing the same chunk twice, where + # possible. if ndim == 1: # Also cache the second element for 1-d data, on the # assumption that they may well be dimension coordinate diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 631c6ccfb..f7d125ab5 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -873,10 +873,10 @@ def _write_count_variable( # -------------------------------------------------------- # Create the sample dimension # -------------------------------------------------------- - _ = self.implementation.nc_get_sample_dimension( + sample_ncdim = self.implementation.nc_get_sample_dimension( count_variable, "element" ) - sample_ncdim = self._name(_) + sample_ncdim = self._name(sample_ncdim) self._write_dimension( sample_ncdim, f, @@ -1416,7 +1416,6 @@ def _write_bounds( self._createDimension( parent_group, base_bounds_ncdim, size ) - # parent_group.createDimension(base_bounds_ncdim, size) except RuntimeError: raise @@ -2140,8 +2139,8 @@ def _write_scalar_coordinate( ): """Write a scalar coordinate and its bounds to the dataset. - It is assumed that the input coordinate is has size 1, but - this is not checked. + It is assumed that the input coordinate has size 1, but this + is not checked. If an equal scalar coordinate has already been written to the dataset then the input coordinate is not written. @@ -2635,10 +2634,12 @@ def _createVariable(self, **kwargs): chunks = shape if shards is not None: - # Calculate the shard shape in the format expected - # by `zarr.create_array`, i.e. shards are defined - # by how many array elements along each dimension - # are in each shard. + # create the shard shape in the format expected by + # `zarr.create_array`, 'shards' is curerntly + # defined by how many *chunks* along each + # dimension are in each shard, but `zarr required + # shards defined by how many *array elements* + # along each dimension are in each shard. if chunks == shape: # One chunk per shard. # @@ -2648,6 +2649,13 @@ def _createVariable(self, **kwargs): else: ndim = len(chunks) if isinstance(shards, Integral): + # Make a conservative estimate of how many + # whole chunks along each dimension are in + # a shard. 
This may result in fewer than + # 'shards' chunks in each shard, but is + # guaranteed to give us a shard shape of + # less than the data shape, which is a + # `zarr` requirement. n = int(shards ** (1 / ndim)) shards = (n,) * ndim @@ -2936,9 +2944,9 @@ def _write_netcdf_variable( ) logger.debug( - f" chunksizes: {chunksizes}\n" - f" contiguous: {contiguous}" - f" shards : {shards}" + f" chunksizes: {chunksizes!r}\n" + f" contiguous: {contiguous!r}" + f" shards : {shards!r}" ) # pragma: no cover # ------------------------------------------------------------ @@ -2976,10 +2984,11 @@ def _write_netcdf_variable( # (CF>=1.8) # ------------------------------------------------------------ if g["backend"] == "zarr": - # ... but not for Zarr. This is because Zarr doesn't have - # the concept of dimensions belonging to a group (unlike - # netCDF), so by keeping the group structure in the - # dimension names we can know which group they belong to. + # ... but not for Zarr. This is because the Zarr data + # model doesn't have the concept of dimensions belonging + # to a group (unlike netCDF), so by keeping the group + # structure in the dimension names we can know which group + # they belong to. kwargs["dimensions"] = ncdimensions else: ncdimensions_basename = [ diff --git a/cfdm/read_write/netcdf/zarr.py b/cfdm/read_write/netcdf/zarr.py index 4db149e2e..c4df9fce6 100644 --- a/cfdm/read_write/netcdf/zarr.py +++ b/cfdm/read_write/netcdf/zarr.py @@ -23,7 +23,9 @@ def __init__(self, name, size, group, reference_variable=None): The group that the dimension is a member of. reference_variable: `zarr.Array`, optional - The variable that provided the dimension defintion. + The variable that provided the dimension definition. + + .. versionadded:: (cfdm) NEXTVERSION """ self._name = name @@ -108,8 +110,8 @@ def reference_variable(self): :Returns: `zarr.Array` or `None` - The variable that provided the dimension defintion, or - `None` if it wasn't provided during instance + The variable that provided the dimension definition, + or `None` if it wasn't provided during instance initialisation. """ diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 1aeebe9d3..e07a9bf17 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -178,6 +178,11 @@ Tests are run from within the ``cfdm/test`` directory: **Dependencies** ---------------- +.. _Required: + +Required +^^^^^^^^ + The cfdm package requires: * `Python `_, version 3.9 or newer. @@ -195,8 +200,6 @@ The cfdm package requires: * `h5py `_, version 3.12.1 or newer. -* `zarr `_, version 3.1.2 or newer. - * `s3fs `_, version 2024.6.0 or newer. * `dask `_, version 2025.5.1 or newer. @@ -237,6 +240,19 @@ The cfdm package requires: then setting the ``UDUNITS2_XML_PATH`` environment variable is the likely solution. +Optional +^^^^^^^^ + +Some further dependencies that enable further functionality are +optional. This to facilitate cfdm being installed in restricted +environments for which these features are not required. + +.. rubric:: Zarr + +* `zarr `_, version 3.1.2 or newer. + + For reading and writing Zarr datasets. + ---- .. 
_Code-repository: diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst index f3710d4ce..8c56e43f3 100644 --- a/docs/source/introduction.rst +++ b/docs/source/introduction.rst @@ -85,7 +85,8 @@ The `cfdm` package can * create new field and domain constructs in memory, -* write field and domain constructs to netCDF datasets on disk, +* write field and domain constructs to netCDF and Zarr datasets on + disk, * read, write, and manipulate UGRID mesh topologies, diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 7a07c8785..8b42c92f6 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -3465,7 +3465,7 @@ Method Classes ------------------- The `cfdm.write` function writes a field construct, or a sequence of -field constructs, to a netCDF file on disk: +field constructs, to a netCDF or Zarr dataset on disk: .. code-block:: python :caption: *Write a field construct to a netCDF dataset on disk.* @@ -3535,8 +3535,8 @@ By default the output file will be for CF-|version|. The `cfdm.write` function has optional parameters to -* set the output netCDF format (all netCDF3 and netCDF4 formats are - possible); +* set the output dataset format (all netCDF3 and netCDF4 formats, as + well as Zarr v3 are possible); * append to the netCDF file rather than over-writing it by default; From 4451f99d0db8b7e864f33eb7fe9954d9b311f4da Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 6 Oct 2025 12:00:25 +0100 Subject: [PATCH 23/39] dev --- cfdm/cfdmimplementation.py | 4 +- cfdm/docstring/docstring.py | 81 +++++++++++--------- cfdm/mixin/netcdf.py | 14 ++-- cfdm/read_write/netcdf/flatten/flatten.py | 90 ++++++++++++++--------- cfdm/read_write/netcdf/netcdfread.py | 28 +++---- cfdm/read_write/read.py | 2 +- cfdm/test/test_zarr.py | 23 +++++- 7 files changed, 144 insertions(+), 98 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 214451d85..9dd2ab3d2 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1104,7 +1104,7 @@ def nc_get_dataset_chunksizes(self, data): return data.nc_dataset_chunksizes() def nc_get_dataset_shards(self, data): - """Get the dataset sharding strategy for the data. + """Get the Zarr dataset sharding strategy for the data. ..versionadded:: (cfdm) NEXTVERSION @@ -1263,7 +1263,7 @@ def nc_set_hdf5_chunksizes(self, data, chunksizes): ) def nc_set_dataset_shards(self, data, shards): - """Set the dataset sharding strategy for the data. + """Set the Zarr dataset sharding strategy for the data. ..versionadded:: (cfdm) NEXTVERSION diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index f3adb4998..30792ab4a 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -726,49 +726,60 @@ parent directory of itself.""", # read group_dimension_search "{{read group_dimension_search: `str`, optional}}": """group_dimension_search: `str`, optional - How to interpret a sub-group dimension name that has no - path, i.e. that contains no group-separator characters, - such as ``dim`` (as opposed to ``group/dim``, - ``/group/dim``, ``../dim``, etc.). Such a dimension name - could be a variable array dimension name, or be referenced - by variable attribute. - - This is only required for reading a Zarr dataset, for - which there is no means of indicating whether the same - dimension names that appear in different groups correspond - to each other, or not. 
- - For a non-Zarr dataset that adheres to the netCDF data - model (such as a netCDF-4 dataset), - *group_dimension_search* **is ignored** because any - correspondence between dimensions is already explicitly - defined. - - The *group_dimension_search* parameter must be one of: - * ``'furthest_ancestor'`` - - This is the default. Assume that the Zarr sub-group - dimension is the same as the one with the same name and - size in an ancestor group, if one exists. If multiple - such dimensions exist, then the correspondence is with - the dimension in the ancestor group that is **furthest - away** from the sub-group (i.e. that is closest to the - root group). + How to interpret a dimension name that contains no + group-separator characters, such as ``dim`` (as opposed to + ``group/dim``, ``/group/dim``, ``../dim``, etc.). The + *group_dimension_search* parameter must be one of: * ``'closet_ancestor'`` - Assume that the Zarr sub-group dimension is the same as - the dimension with the same name and size in an ancestor + This is the default and is the behaviour defined by the + CF conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + dimension with the same name and size in an ancestor group, if one exists. If multiple such dimensions exist, then the correspondence is with the dimension in the - ancestor group that is **closest to** the sub-group + ancestor group that is **closest** to the sub-group (i.e. that is furthest away from the root group). + * ``'furthest_ancestor'`` + + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + one with the same name and size in an ancestor group, if + one exists. If multiple such dimensions exist, then the + correspondence is with the dimension in the ancestor + group that is **furthest away** from the sub-group + (i.e. that is closest to the root group). + * ``'local'`` + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + Assume that the Zarr sub-group dimension is different to - any with the same name and size in all ancestor groups.""", + any with the same name and size in all ancestor groups. + + .. note:: For netCDF dataset, for which it is inherently + well-defined in which group a dimension is + defined, *group_dimension_search* may only take + the default value of ``'closet_ancestor'`, which + applies the behaviour defined by the CF + conventions (section 2.7 Groups). + + For a Zarr dataset, for which there is no means + of indicating whether or not the same dimension + names that appear in different groups correspond + to each other, setting this parameter may be + necessary for the correct interpretation of the + dataset in the event that its dimensions are + named in a manner that is inconsistent with CF + rules defined by the CF conventions (section 2.7 + Groups).""", # persist "{{persist description}}": """Persisting turns an underlying lazy dask array into an equivalent chunked dask array, but now with the results fully @@ -1384,9 +1395,9 @@ to store multiple chunks in a single storage object or file. This can be useful because traditional file systems and object storage systems may have performance issues storing and - accessing many files. Additionally, small files can be - inefficient to store if they are smaller than the block size - of the file system. + accessing large number of files. 
Additionally, small files can + be inefficient to store if they are smaller than the block + size of the file system. The sharding strategy is ignored when writing to a non-Zarr dataset.""", diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index bb427e1cd..40889610e 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -5133,14 +5133,14 @@ def nc_set_aggregation_write_status(self, status): class NetCDFShards(NetCDFMixin): - """Mixin class for accessing dataset shard size. + """Mixin class for accessing Zarr dataset shard size. When writing to a Zarr dataset, sharding provides a mechanism to store multiple chunks in a single storage object or file. This can be useful because traditional file systems and object storage - systems may have performance issues storing and accessing many - files. Additionally, small files can be inefficient to store if - they are smaller than the block size of the file system. + systems may have performance issues storing and accessing a large + number of files. Additionally, small files can be inefficient to + store if they are smaller than the block size of the file system. The sharding strategy is ignored when writing to a non-Zarr dataset. @@ -5150,7 +5150,7 @@ class NetCDFShards(NetCDFMixin): """ def nc_dataset_shards(self): - """Get the dataset shard size for the data. + """Get the Zarr dataset shard size for the data. {{sharding description}} @@ -5188,7 +5188,7 @@ def nc_dataset_shards(self): return self._get_netcdf().get("dataset_shards") def nc_clear_dataset_shards(self): - """Clear the dataset shard size for the data. + """Clear the Zarr dataset shard size for the data. {{sharding description}} @@ -5225,7 +5225,7 @@ def nc_clear_dataset_shards(self): return self._get_netcdf().pop("dataset_shards", None) def nc_set_dataset_shards(self, shards): - """Set the dataset sharding strategy for the data. + """Set the Zarr dataset sharding strategy for the data. {{sharding description}} diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 5444aa727..ff9ba5012 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -52,7 +52,7 @@ def netcdf_flatten( output_ds, strict=True, copy_data=True, - group_dimension_search="furthest_ancestor", + group_dimension_search="closest_ancestor", ): """Create a flattened version of a grouped CF dataset. @@ -108,51 +108,61 @@ def netcdf_flatten( actually create these arrays in memory or on disk. group_dimension_search: `str`, optional - How to interpret a sub-group dimension name that has no - path, i.e. that contains no group-separator characters, - such as ``dim`` (as opposed to ``group/dim``, - ``/group/dim``, ``../dim``, etc.). Such a dimension name - could be a variable array dimension name, or be referenced - by variable attribute. - - This is only required for reading a Zarr dataset, for - which there is no means of indicating whether the same - dimension names that appear in different groups correspond - to each other, or not. - - For a non-Zarr dataset that adheres to the netCDF data - model (such as a netCDF-4 dataset), - *group_dimension_search* **is ignored** because any - correspondence between dimensions is already explicitly - defined. - - The *group_dimension_search* parameter must be one of: - - * ``'furthest_ancestor'`` - - This is the default. Assume that the Zarr sub-group - dimension is the same as the one with the same name and - size in an ancestor group, if one exists. 
If multiple - such dimensions exist, then the correspondence is with - the dimension in the ancestor group that is **furthest - away** from the sub-group (i.e. that is closest to the - root group). + How to interpret a dimension name that contains no + group-separator characters, such as ``dim`` (as opposed to + ``group/dim``, ``/group/dim``, ``../dim``, etc.). The + *group_dimension_search* parameter must be one of: * ``'closet_ancestor'`` - Assume that the Zarr sub-group dimension is the same as - the dimension with the same name and size in an ancestor + This is the default and is the behaviour defined by the + CF conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + dimension with the same name and size in an ancestor group, if one exists. If multiple such dimensions exist, then the correspondence is with the dimension in the - ancestor group that is **closest to** the sub-group + ancestor group that is **closest** to the sub-group (i.e. that is furthest away from the root group). + * ``'furthest_ancestor'`` + + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + + Assume that the sub-group dimension is the same as the + one with the same name and size in an ancestor group, if + one exists. If multiple such dimensions exist, then the + correspondence is with the dimension in the ancestor + group that is **furthest away** from the sub-group + (i.e. that is closest to the root group). + * ``'local'`` + This behaviour is different to that defined by the CF + conventions (section 2.7 Groups). + Assume that the Zarr sub-group dimension is different to any with the same name and size in all ancestor groups. - .. versionadded:: (cfdm) NEXTVERSION + .. note:: For netCDF dataset, for which it is inherently + well-defined in which group a dimension is + defined, *group_dimension_search* may only take + the default value of ``'closet_ancestor'`, which + applies the behaviour defined by the CF + conventions (section 2.7 Groups). + + For a Zarr dataset, for which there is no means + of indicating whether or not the same dimension + names that appear in different groups correspond + to each other, setting this parameter may be + necessary for the correct interpretation of the + dataset in the event that its dimensions are + named in a manner that is inconsistent with CF + rules defined by the CF conventions (section 2.7 + Groups). + + .. versionadded:: (cfdm) NEXTVERSION :Returns: @@ -294,7 +304,7 @@ def __init__( output_ds, strict=True, copy_data=True, - group_dimension_search="furthest_ancestor", + group_dimension_search="closest_ancestor", ): """**Initialisation** @@ -1936,6 +1946,14 @@ def _group_dimensions(self, group): """ match self._backend(): case "h5netcdf" | "netCDF4": + if self._group_dimension_search != "closest_ancestor": + raise ValueError( + f"For netCDF dataset {self.dataset_name()}, " + "group_dimension_search keyword must be " + "'closest_ancestor'. " + f"Got {self._group_dimension_search!r}" + ) + return group.dimensions case "zarr": @@ -2075,8 +2093,8 @@ def _populate_dimension_maps(self, group): # E.g. "dim" # ------------------------------------------------ if group_dimension_search in ( - "furthest_ancestor", "closest_ancestor", + "furthest_ancestor", ): # Find the names of all ancestor groups, in # the appropriate order for searching. 
diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index ba128a5e8..cd4ab8371 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -952,7 +952,7 @@ def read( dataset_type=None, cdl_string=False, ignore_unknown_type=False, - group_dimension_search="furthest_ancestor", + group_dimension_search="closest_ancestor", ): """Reads a netCDF dataset from file or OPenDAP URL. @@ -9616,7 +9616,7 @@ def _netCDF4_group(self, nc, name): **Examples** >>> n._netCDF4_group(nc, '/forecast/count') - (. 'count') + (, 'count') """ group = nc @@ -11203,17 +11203,19 @@ def _file_variable_dimensions(self, var): return var.dimensions case "zarr": - try: - # Zarr v3 - dimension_names = var.metadata.dimension_names - if dimension_names is None: - # Scalar variable - dimension_names = () - - return dimension_names - except AttributeError: - # Zarr v2 - return tuple(var.attrs["_ARRAY_DIMENSIONS"]) + match var.metadata.zarr_format: + case 3: + # Zarr v3 + dimension_names = var.metadata.dimension_names + if dimension_names is None: + # Scalar variable + dimension_names = () + + return dimension_names + + case 2: + # Zarr v2 + return tuple(var.attrs["_ARRAY_DIMENSIONS"]) def _get_storage_options(self, dataset, parsed_dataset): """Get the storage options for accessing a file. diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index f7a29a143..0e3d44ebc 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -257,7 +257,7 @@ def __new__( followlinks=False, cdl_string=False, extra_read_vars=None, - group_dimension_search="furthest_ancestor", + group_dimension_search="closest_ancestor", **kwargs, ): """Read field or domain constructs from datasets. diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 082c94af3..0379f6630 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -219,7 +219,7 @@ def test_zarr_groups_1(self): self.assertTrue(z1.equals(f)) def test_zarr_groups_dimension(self): - """Test the dimensions of Zarr hierarchical groups.""" + """Test Zarr groups dimensions.""" f = self.f0.copy() grouped_dir = tmpdir1 @@ -248,8 +248,15 @@ def test_zarr_groups_dimension(self): self.assertTrue(z.equals(n)) self.assertTrue(z.equals(f)) - def test_zarr_groups_compression(self): - """Test the compression of Zarr hierarchical groups.""" + # Check that grouped netCDF datasets can only be read with + # 'closest_ancestor' + cfdm.read(grouped_file, group_dimension_search="closest_ancestor") + for gsn in ("furthest_ancestor", "local", "BAD VALUE"): + with self.assertRaises(ValueError): + cfdm.read(grouped_file, group_dimension_search=gsn) + + def test_zarr_groups_DSG(self): + """Test Zarr groups containing DSGs.""" f = cfdm.example_field(4) grouped_dir = "tmpdir1" @@ -283,7 +290,7 @@ def test_zarr_groups_compression(self): self.assertTrue(z.equals(f)) def test_zarr_groups_geometry(self): - """Test that geometries in Zarr groups.""" + """Test Zarr groups containing cell geometries.""" f = cfdm.example_field(6) grouped_dir = tmpdir1 @@ -310,6 +317,14 @@ def test_zarr_groups_geometry(self): self.assertTrue(z.equals(n)) self.assertTrue(z.equals(f)) + def test_zarr_read_v2(self): + """Test reading Zarr v2.""" + f2 = cfdm.read("example_field_0.zarr2") + f3 = cfdm.read("example_field_0.zarr3") + self.assertEqual(len(f2), len(f3)) + self.assertEqual(len(f2), 1) + self.assertTrue(f2[0].equals(f3[0])) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) From 
2c902459836d5688a31f36fb4f97ddbf1cf2763b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 6 Oct 2025 14:03:20 +0100 Subject: [PATCH 24/39] dev --- cfdm/docstring/docstring.py | 2 +- docs/source/installation.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 30792ab4a..39d295360 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -562,7 +562,7 @@ at the time of writing.""", # read store_dataset_shards "{{read store_dataset_shards: `bool`, optional}}": """store_dataset_shards: `bool`, optional - If True (the default) then store the dataset sharding + If True (the default) then store the Zarr dataset sharding strategy for each returned data array. The dataset sharding strategy is then accessible via an object's `nc_dataset_shards` method. When the dataset sharding diff --git a/docs/source/installation.rst b/docs/source/installation.rst index e07a9bf17..5238faf51 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -252,7 +252,7 @@ environments for which these features are not required. * `zarr `_, version 3.1.2 or newer. For reading and writing Zarr datasets. - + ---- .. _Code-repository: From 3b4993c1e372ff1472b20b844e40e1ec6edc7162 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Oct 2025 08:36:20 +0100 Subject: [PATCH 25/39] dev --- cfdm/read_write/netcdf/netcdfwrite.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index f7d125ab5..d2e2d4c4b 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2596,8 +2596,10 @@ def _createVariable(self, **kwargs): if "dimensions" not in kwargs: netcdf4_kwargs["dimensions"] = () + contiguous = kwargs.get("contiguous") + NETCDF4 = g["dataset"].data_model.startswith("NETCDF4") - if NETCDF4 and kwargs.get("contiguous"): + if NETCDF4 and contiguous: # NETCDF4 contiguous variables can't be compressed kwargs["compression"] = None kwargs["complevel"] = 0 @@ -2618,6 +2620,9 @@ def _createVariable(self, **kwargs): f"{unlimited_dimensions}" ) + if contiguous: + netcdf4_kwargs.pop("fletcher32", None) + # Remove Zarr-specific kwargs netcdf4_kwargs.pop("shape", None) netcdf4_kwargs.pop("shards", None) @@ -2637,11 +2642,11 @@ def _createVariable(self, **kwargs): # create the shard shape in the format expected by # `zarr.create_array`, 'shards' is curerntly # defined by how many *chunks* along each - # dimension are in each shard, but `zarr required + # dimension are in each shard, but `zarr requires # shards defined by how many *array elements* # along each dimension are in each shard. if chunks == shape: - # One chunk per shard. + # One chunk # # It doesn't matter what 'shards' is, because # the data only has one chunk. From 33dc03b5463448258f4d31a178de51926bf3c242 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 13 Oct 2025 12:53:24 +0100 Subject: [PATCH 26/39] deprecated netcdf_flatten --- cfdm/functions.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/cfdm/functions.py b/cfdm/functions.py index 799cb7b93..a5ad47fc0 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -2398,6 +2398,18 @@ def parse_indices(shape, indices, keepdims=True, newaxis=False): return parsed_indices +def netcdf_flatten(*args, **kwargs): + """Create a flattened version of a grouped CF dataset. + + .. 
versionadded:: (cfdm) 1.11.2.0 + + """ + raise DeprecationError( + "Function 'netcdf_flatten' has been renamed 'dataset_flatten' " + "at version NEXTVERSION" + ) + + def _DEPRECATION_ERROR_KWARGS( instance, method, From fef739fd29e5818c5211fa767efbd50d941371de Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 13 Oct 2025 14:24:43 +0100 Subject: [PATCH 27/39] dev --- cfdm/__init__.py | 3 +- cfdm/data/netcdfindexer.py | 2 +- cfdm/docstring/docstring.py | 9 ++-- cfdm/read_write/netcdf/flatten/__init__.py | 2 +- cfdm/read_write/netcdf/flatten/flatten.py | 60 ++++++++++++---------- cfdm/read_write/netcdf/netcdfread.py | 6 +-- cfdm/read_write/netcdf/netcdfwrite.py | 12 ++--- cfdm/read_write/read.py | 3 +- cfdm/read_write/write.py | 4 +- docs/source/api_reference.rst | 2 +- docs/source/functions.rst | 4 +- 11 files changed, 57 insertions(+), 50 deletions(-) diff --git a/cfdm/__init__.py b/cfdm/__init__.py index c7f6ca659..b84152327 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -224,6 +224,7 @@ is_log_level_debug, is_log_level_detail, is_log_level_info, + netcdf_flatten, ) # Though these are internal-use methods, include them in the namespace @@ -310,7 +311,7 @@ from .cfdmimplementation import CFDMImplementation, implementation from .read_write import read, write -from .read_write.netcdf.flatten import netcdf_flatten +from .read_write.netcdf.flatten import dataset_flatten from .examplefield import example_field, example_fields, example_domain diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index e50fbfd68..43d795605 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -35,7 +35,7 @@ class netcdf_indexer: """A data indexer that also applies netCDF masking and unpacking. - Here "netCDF4" refers to the API of the netCDF data model, rather + Here "netCDF" refers to the API of the netCDF data model, rather than any particular dataset encoding or software library API. Indexing may be orthogonal or non-orthogonal. Orthogonal indexing diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 39d295360..5a77028de 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -726,13 +726,12 @@ parent directory of itself.""", # read group_dimension_search "{{read group_dimension_search: `str`, optional}}": """group_dimension_search: `str`, optional - How to interpret a dimension name that contains no group-separator characters, such as ``dim`` (as opposed to ``group/dim``, ``/group/dim``, ``../dim``, etc.). The *group_dimension_search* parameter must be one of: - * ``'closet_ancestor'`` + * ``'closest_ancestor'`` This is the default and is the behaviour defined by the CF conventions (section 2.7 Groups). @@ -764,11 +763,11 @@ Assume that the Zarr sub-group dimension is different to any with the same name and size in all ancestor groups. - .. note:: For netCDF dataset, for which it is inherently + .. note:: For a netCDF dataset, for which it is always well-defined in which group a dimension is defined, *group_dimension_search* may only take - the default value of ``'closet_ancestor'`, which - applies the behaviour defined by the CF + the default value of ``'closest_ancestor'`, + which applies the behaviour defined by the CF conventions (section 2.7 Groups). 
For a Zarr dataset, for which there is no means diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py index 82e6a3c9e..6c106911d 100644 --- a/cfdm/read_write/netcdf/flatten/__init__.py +++ b/cfdm/read_write/netcdf/flatten/__init__.py @@ -13,4 +13,4 @@ """ -from .flatten import netcdf_flatten +from .flatten import dataset_flatten diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index ff9ba5012..c671c2a0f 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -47,7 +47,7 @@ referencing_attributes = set(flattening_rules) -def netcdf_flatten( +def dataset_flatten( input_ds, output_ds, strict=True, @@ -56,22 +56,24 @@ def netcdf_flatten( ): """Create a flattened version of a grouped CF dataset. - **CF-netCDF coordinate variables** + The following dataset formats can be flattened: netCDF and Zarr. - When a CF-netCDF coordinate variable in the input dataset is in a + **CF coordinate variables** + + When a CF coordinate variable (i.e. a one-dimensional variable + with the same name as its dimension) in the input dataset is in a different group to its corresponding dimension, the same variable - in the output flattened dataset will no longer be a CF-netCDF - coordinate variable, as its name will be prefixed with a different - group identifier than its dimension. + in the output flattened dataset will no longer be a CF coordinate + variable, as its name will be prefixed with a different group + identifier than its dimension. In such cases it is up to the user to apply the proximal and lateral search algorithms to the flattened dataset returned by - `netcdf_flatten`, in conjunction with the mappings defined in the + `dataset_flatten`, in conjunction with the mappings defined in the newly created global attributes ``_flattener_variable_map`` and - ``_flattener_dimension_map``, to find which netCDF variables are - acting as CF coordinate variables in the flattened dataset. See - https://cfconventions.org/cf-conventions/cf-conventions.html#groups - for details. + ``_flattener_dimension_map``, to find which variables are acting + as CF coordinate variables in the flattened dataset. See CF + conventions section 2.7 Groups for details. For example, if an input dataset has dimension ``lat`` in the root group and coordinate variable ``lat(lat)`` in group ``/group1``, @@ -87,9 +89,9 @@ def netcdf_flatten( :Parameters: input_ds: - The dataset to be flattened. Must be an object with the - the same API as `netCDF4.Dataset` or `h5netcdf.File`, or - else a `zarr.Group` object. + The dataset to be flattened. Must be an open dataet object + with the same API as `netCDF4.Dataset`, `h5netcdf.File`, + or `zarr.Group`. output_ds: `netCDF4.Dataset` A container for the flattened dataset that will get @@ -113,7 +115,7 @@ def netcdf_flatten( ``group/dim``, ``/group/dim``, ``../dim``, etc.). The *group_dimension_search* parameter must be one of: - * ``'closet_ancestor'`` + * ``'closest_ancestor'`` This is the default and is the behaviour defined by the CF conventions (section 2.7 Groups). @@ -142,14 +144,14 @@ def netcdf_flatten( This behaviour is different to that defined by the CF conventions (section 2.7 Groups). - Assume that the Zarr sub-group dimension is different to - any with the same name and size in all ancestor groups. + Assume that the sub-group dimension is different to any + with the same name and size in all ancestor groups. - .. 
note:: For netCDF dataset, for which it is inherently + .. note:: For a netCDF dataset, for which it is always well-defined in which group a dimension is defined, *group_dimension_search* may only take - the default value of ``'closet_ancestor'`, which - applies the behaviour defined by the CF + the default value of ``'closest_ancestor'`, + which applies the behaviour defined by the CF conventions (section 2.7 Groups). For a Zarr dataset, for which there is no means @@ -292,7 +294,7 @@ class _Flattener: Contains the input file, the output file being flattened, and all the logic of the flattening process. - See `netcdf_flatten` for detais. + See `dataset_flatten` for detais. .. versionadded:: (cfdm) 1.11.2.0 @@ -319,13 +321,13 @@ def __init__( A container for the flattened dataset. strict: `bool`, optional - See `netcdf_flatten`. + See `dataset_flatten`. copy_data: `bool`, optional - See `netcdf_flatten`. + See `dataset_flatten`. group_dimension_search: `str`, optional - See `netcdf_flatten`. + See `dataset_flatten`. .. versionadded:: (cfdm) NEXTVERSION @@ -2032,7 +2034,7 @@ def _populate_dimension_maps(self, group): information is not explicitly defined in the format's data model (unlike for netCDF and HDF5 datasets). - See `netcdf_flatten` for details + See `dataset_flatten` for details .. versionadded:: (cfdm) NEXTVERSION @@ -2088,7 +2090,9 @@ def _populate_dimension_maps(self, group): if group_separator not in name: # ------------------------------------------------ # Relative path dimension name which contains no - # '/' characters. + # '/' characters. The behaviour depends on the + # search algorithm defined by + # 'group_dimension_search'. # # E.g. "dim" # ------------------------------------------------ @@ -2164,8 +2168,8 @@ def _populate_dimension_maps(self, group): elif f"{group_separator}..{group_separator}" in name: # -------------------------------------------- # Relative path dimension name with upward - # path traversals ('../') not at the start of - # the name. + # path traversals ('../') *not* at the start + # of the name. # # E.g. "/group1/../group2/dim" # E.g. "group1/../group2/dim" diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 4d3bcf6de..01d95c627 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -32,7 +32,7 @@ NETCDF_MAGIC_NUMBERS, NETCDF_QUANTIZATION_PARAMETERS, ) -from .flatten import netcdf_flatten +from .flatten import dataset_flatten from .flatten.config import ( flattener_attribute_map, flattener_dimension_map, @@ -615,7 +615,7 @@ def dataset_open(self, dataset, flatten=True, verbose=None): flat_nc.set_fill_off() # Flatten the file - netcdf_flatten( + dataset_flatten( nc, flat_nc, strict=False, @@ -954,7 +954,7 @@ def read( ignore_unknown_type=False, group_dimension_search="closest_ancestor", ): - """Reads a netCDF dataset from file or OPenDAP URL. + """Reads a netCDF or Zarr dataset from file or OPenDAP URL. Read fields from a netCDF file on disk or from an OPeNDAP server location. diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index a1180b38c..22d9b3f12 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -6629,13 +6629,13 @@ def _cfa_fragment_array_variables(self, data, cfvar): authority="", path=uri.path, ) - fragment = uri.fragment - if fragment is not None: + uri_fragment = uri.fragment + if uri_fragment is not None: # Append a URI fragment. 
Do this with a # string-append, rather than via # `uricompose` in case the fragment # contains more than one # character. - aggregation_file_directory += f"#{fragment}" + aggregation_file_directory += f"#{uri_fragment}" g["aggregation_file_directory"] = ( aggregation_file_directory @@ -6697,13 +6697,13 @@ def _cfa_fragment_array_variables(self, data, cfvar): authority="", path=uri.path, ) - fragment = uri.fragment - if fragment is not None: + uri_fragment = uri.fragment + if uri_fragment is not None: # Append a URI fragment. Do this with a # string-append, rather than via `uricompose` # in case the fragment contains more than one # # character. - filename += f"#{fragment}" + dataset_name += f"#{uri_fragment}" if uri_relative: scheme = uri.scheme diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 8e6684f28..c2d9ce214 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -18,7 +18,8 @@ class read(ReadWrite): """Read field or domain constructs from a dataset. - The following file formats are supported: netCDF, CDL, and Zarr. + The following dataset formats are supported: netCDF, CDL, and + Zarr. NetCDF and Zarr datasets may be on local disk, on an OPeNDAP server, or in an S3 object store. diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index e67105ae6..10ba7468c 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -6,7 +6,9 @@ class write(ReadWrite): - """Write field and domain constructs to a netCDF or Zarr dataset. + """Write field and domain constructs to a dataset. + + The following dataset formats are supported: netCDF and Zarr. **Dataset format** diff --git a/docs/source/api_reference.rst b/docs/source/api_reference.rst index 73a9b20c4..f22386f4c 100644 --- a/docs/source/api_reference.rst +++ b/docs/source/api_reference.rst @@ -16,7 +16,7 @@ Version |release| for version |version| of the CF conventions. * :py:class:`cfdm.AuxiliaryCoordinate` * :py:class:`cfdm.CellMeasure` * :py:class:`cfdm.CellMethod` - * :py:class:`cfdm.Coordinatereference` + * :py:class:`cfdm.CoordinateReference` * :py:class:`cfdm.DimensionCoordinate` * :py:class:`cfdm.DomainAncillary` * :py:class:`cfdm.DomainAxis` diff --git a/docs/source/functions.rst b/docs/source/functions.rst index e26e414c7..d0f9f42a7 100644 --- a/docs/source/functions.rst +++ b/docs/source/functions.rst @@ -20,8 +20,8 @@ Reading and writing cfdm.read cfdm.write - cfdm.netcdf_flatten - cfdm.netcdf_index + cfdm.dataset_flatten + cfdm.netcdf_indexer Mathematical operations ----------------------- From cd9aec742e421e29ad3ba97782eac0839f48d0f7 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 20 Oct 2025 11:20:05 +0100 Subject: [PATCH 28/39] dev --- cfdm/data/aggregatedarray.py | 2 +- cfdm/docstring/docstring.py | 35 ++++++++++++----------- cfdm/read_write/netcdf/flatten/flatten.py | 8 +++--- cfdm/read_write/netcdf/netcdfread.py | 1 - cfdm/read_write/netcdf/netcdfwrite.py | 17 +---------- cfdm/read_write/read.py | 5 +++- cfdm/read_write/write.py | 24 +++++++++------- 7 files changed, 42 insertions(+), 50 deletions(-) diff --git a/cfdm/data/aggregatedarray.py b/cfdm/data/aggregatedarray.py index 41f0c0641..78500da90 100644 --- a/cfdm/data/aggregatedarray.py +++ b/cfdm/data/aggregatedarray.py @@ -265,7 +265,7 @@ def _parse_fragment_array(self, aggregated_filename, fragment_array): uri = fa_uris[index] try: - # 'uri' is a size 1 numpy array + # 'uri' is scalar numpy string type uri = uri.item() except AttributeError: # E.g. 
'uri' is a `str` instance diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py index 5a77028de..3030e10db 100644 --- a/cfdm/docstring/docstring.py +++ b/cfdm/docstring/docstring.py @@ -571,12 +571,12 @@ (unless the strategy is modified prior to writing). If False, or if the dataset being read does not support - sharding (such as a netCDF dataset), then no dataset - sharding strategy is stored (i.e. the `nc_dataset_shards` - method will return `None` for all returned `Data` - objects). In this case, when the data is written to a new - Zarr dataset, the dataset sharding strategy will be - determined by `{{package}}.write`.""", + sharding (such as a Zarr v2 or netCDF dataset), then no + dataset sharding strategy is stored (i.e. the + `nc_dataset_shards` method will return `None` for all + returned `Data` objects). In this case, when the data is + written to a new Zarr dataset, the dataset sharding + strategy will be determined by `{{package}}.write`.""", # read cfa "{{read cfa: `dict`, optional}}": """cfa: `dict`, optional Configure the reading of CF-netCDF aggregation files. @@ -1390,13 +1390,16 @@ replacement. If False (the default) then no normalisation is done.""", # sharding - "{{sharding description}}": """When writing to a Zarr dataset, sharding provides a mechanism - to store multiple chunks in a single storage object or - file. This can be useful because traditional file systems and - object storage systems may have performance issues storing and - accessing large number of files. Additionally, small files can - be inefficient to store if they are smaller than the block - size of the file system. + "{{sharding description}}": """ + When writing to a Zarr dataset, sharding provides a mechanism + to store multiple dataset chunks in a single storage object or + file. Without sharding, each dataset chunk is written to its + own file. Traditional file systems and object storage systems + may have performance issues storing and accessing large number + of files, and small files can be inefficient to store if they + are smaller than the block size of the file system. Sharding + can improve performance by creating fewer, and larger, files + for storing the dataset chunks. The sharding strategy is ignored when writing to a non-Zarr dataset.""", @@ -1434,9 +1437,9 @@ * `int` - The integer number of chunks to be stored in a - single shard, favouring an equal number of chunks - along each shard dimension. + The integer number of dataset chunks to be stored in + a single shard, favouring an equal number of dataset + chunks along each shard dimension. * sequence of `int` diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index c671c2a0f..7bf837b65 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -1951,7 +1951,7 @@ def _group_dimensions(self, group): if self._group_dimension_search != "closest_ancestor": raise ValueError( f"For netCDF dataset {self.dataset_name()}, " - "group_dimension_search keyword must be " + "the group_dimension_search keyword must be " "'closest_ancestor'. " f"Got {self._group_dimension_search!r}" ) @@ -2057,7 +2057,7 @@ def _populate_dimension_maps(self, group): var_to_dims = self._var_to_dims group_dimension_search = self._group_dimension_search - # Initialise mapping from the group to its ZarrDimension + # Initialise the mapping from this group to its ZarrDimension # objects. 
Use 'setdefault' because a previous call to # `_populate_dimension_maps` might already have done this. group_to_dims.setdefault(group_name, {}) @@ -2244,7 +2244,7 @@ def _populate_dimension_maps(self, group): zarr_dim = None if g in group_to_dims: - # Group 'g' is already registered + # Group 'g' is already registered in the mapping zarr_dim = group_to_dims[g].get(basename) if zarr_dim is not None: # Dimension 'basename' is already registered @@ -2260,7 +2260,7 @@ def _populate_dimension_maps(self, group): f"dimension_name={name}" ) else: - # Initialise group 'g' + # Initialise group 'g' in the mapping group_to_dims[g] = {} if zarr_dim is None: diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 01d95c627..1b955b90f 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -6666,7 +6666,6 @@ def _create_netcdfarray( group, name = self._netCDF4_group( g["variable_grouped_dataset"][ncvar], ncvar ) - # variable = group.variables.get(name) variable = self._file_group_variables(group).get(name) else: diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 22d9b3f12..27ac92f4d 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -1366,9 +1366,6 @@ def _write_bounds( size = data.shape[-1] - # bounds_ncdim = self._name('bounds{0}'.format(size), - # dimsize=size, role='bounds') - bounds_ncdim = self.implementation.nc_get_dimension( bounds, f"bounds{size}" ) @@ -1827,7 +1824,6 @@ def _parent_group(self, name): ) for group_name in name.split("/")[1:-1]: - # if group_name not in parent_group: parent_group = self._createGroup(parent_group, group_name) return parent_group @@ -4883,16 +4879,11 @@ def _write_global_attributes(self, fields): if not g["dry_run"] and not g["post_dry_run"]: attrs = {"Conventions": delimiter.join(g["Conventions"])} - # g["dataset"].setncattr( - # "Conventions", delimiter.join(g["Conventions"]) - # ) # ------------------------------------------------------------ # Write the file descriptors to the dataset # ------------------------------------------------------------ attrs.update(g["file_descriptors"]) - # for attr, value in g["file_descriptors"].items(): - # g["dataset"].setncattr(attr, value) # ------------------------------------------------------------ # Write other global attributes to the dataset @@ -4903,10 +4894,6 @@ def _write_global_attributes(self, fields): for attr in global_attributes - set(("Conventions",)) } ) - # for attr in global_attributes - set(("Conventions",)): - # g["dataset"].setncattr( - # attr, self.implementation.get_property(f0, attr) - # ) # ------------------------------------------------------------ # Write "forced" global attributes to the dataset @@ -4914,8 +4901,6 @@ def _write_global_attributes(self, fields): attrs.update(force_global) self._set_attributes(attrs, group=g["dataset"]) - # for attr, v in force_global.items(): - # g["dataset"].setncattr(attr, v) g["global_attributes"] = global_attributes @@ -5120,7 +5105,7 @@ def write( *mode* Description ======== ================================================= - ``'w'``-- Open a new dataset for writing to. If it + ``'w'`` Open a new dataset for writing to. If it exists and *overwrite* is True then the dataset is deleted prior to being recreated. 
diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index c2d9ce214..d7d517eb1 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -380,7 +380,10 @@ def _datasets(self): if isdir(x): if is_zarr(x): # This directory is a Zarr dataset, so don't - # look in any subdirectories. + # look in any subdirectories, which we contain + # the dataset chunks (but note - it is allowed + # for non-chunk subdirectories to exist, but + # if they do we're going to ignore them!). n_datasets += 1 yield x continue diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 10ba7468c..f75ec5e37 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -619,13 +619,15 @@ class write(ReadWrite): dataset_shards: `None` or `int`, optional When writing to a Zarr dataset, sharding provides a - mechanism to store multiple chunks in a single storage - object or file. This can be useful because traditional - file systems and object storage systems may have - performance issues storing and accessing many - files. Additionally, small files can be inefficient to - store if they are smaller than the block size of the file - system. + mechanism to store multiple dataset chunks in a single + storage object or file. Without sharding, each dataset + chunk is written to its own file. Traditional file systems + and object storage systems may have performance issues + storing and accessing large number of files, and small + files can be inefficient to store if they are smaller than + the block size of the file system. Sharding can improve + performance by creating fewer, and larger, files for + storing the dataset chunks. The *dataset_shards* parameter is ignored when writing to a non-Zarr datset. @@ -644,12 +646,12 @@ class write(ReadWrite): * `int` - The integer number of chunks to be stored in a single - shard, favouring an equal number of chunks along each - shard dimenson. + The integer number of dataset chunks to be stored in a + single shard, favouring an equal number of dataset + chunks along each shard dimenson. *Example:* - For two-dimensional `Data`, ``dataset_shards=9`` will + For two-dimensional data, ``dataset_shards=9`` will result in shards that span 3 chunks along each dimension. 
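For illustration, the sharding keyword documented above might be used as follows when writing to a Zarr dataset (a minimal sketch: the example field and the output path are assumptions, while ``fmt="ZARR3"`` and ``dataset_shards`` follow the keywords described in this series)::

    import cfdm

    f = cfdm.example_field(0)

    # Ask cfdm.write to group roughly 9 dataset chunks (about 3 along
    # each of the two data dimensions) into every Zarr shard. The
    # dataset chunking itself is determined separately, e.g. by a
    # chunking strategy stored on the data or by cfdm.write's defaults.
    cfdm.write(f, "example_field_0.zarr", fmt="ZARR3", dataset_shards=9)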
From 8616dcf7b6b702062a0ac7d228a3c5cb59304c57 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 20 Oct 2025 17:51:36 +0100 Subject: [PATCH 29/39] dev --- Changelog.rst | 3 +-- cfdm/test/test_zarr.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index e33a7ee87..f8d466b7b 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -12,7 +12,7 @@ Version NEXTVERSION * Removed dependency (now optional): ``zarr>=3.0.8`` ---- - + Version 1.12.3.1 ---------------- @@ -34,7 +34,6 @@ Version 1.12.3.0 **2025-08-18** - * Fix `cfdm.Data.reshape` when the underlying data originate on disk (https://github.com/NCAS-CMS/cfdm/issues/348) * New keyword parameter to `cfdm.Field.dump`: ``data`` diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index 0379f6630..e653a71ec 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -259,8 +259,8 @@ def test_zarr_groups_DSG(self): """Test Zarr groups containing DSGs.""" f = cfdm.example_field(4) - grouped_dir = "tmpdir1" - grouped_file = "tmpfile1.nc" + grouped_dir = tmpdir1 + grouped_file = tmpfile1 f.compress("indexed_contiguous", inplace=True) f.data.get_count().nc_set_variable("count") From 1b95e5c3276f7c193cbfffc79cbb9054a7073499 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 27 Oct 2025 13:49:50 +0000 Subject: [PATCH 30/39] dev --- cfdm/read_write/netcdf/netcdfread.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 1b955b90f..cf2b8a17f 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -11772,7 +11772,6 @@ def _cache_data_elements(self, data, ncvar, attributes): v = variable[...] else: v = variable - index = (slice(0, 1),) * ndim1 + (slice(0, 2),) values = v[index].squeeze().tolist() if data.size == 2: @@ -11782,7 +11781,6 @@ def _cache_data_elements(self, data, ncvar, attributes): values += v[index].squeeze().tolist() del v - elif size == 1: indices = (0, -1) value = variable[...] @@ -11803,7 +11801,13 @@ def _cache_data_elements(self, data, ncvar, attributes): ] # Create a dictionary of the element values - elements = {index: value for index, value in zip(indices, values)} + # + # Note: some backends might give `None` for uninitialised + # data, when we want `np.ma.masked` in this case. 
+ elements = { + index: (value if value is not None else np.ma.masked) + for index, value in zip(indices, values) + } # Cache the cached data elements for this variable g["cached_data_elements"][ncvar] = elements From 48fd32ac0cb822f79e7cff6e7e8ff64ad1452267 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 29 Oct 2025 14:58:01 +0000 Subject: [PATCH 31/39] dev --- cfdm/read_write/netcdf/netcdfwrite.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 27ac92f4d..07587d623 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -1582,7 +1582,7 @@ def _write_node_coordinates( if ncdim not in ncdim_to_size: size = self.implementation.get_data_size(nodes) logger.info( - f" Writing size {size} node dimension: {ncdim}" + f" Writing size {size} geometry node dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size @@ -2008,7 +2008,8 @@ def _write_part_node_count(self, f, coord, bounds, encodings): ncdim_to_size = g["ncdim_to_size"] if ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} part " f"dimension{ncdim}" + f" Writing size {size} geometry part " + f"dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size @@ -2097,7 +2098,8 @@ def _write_interior_ring(self, f, coord, bounds, encodings): ncdim_to_size = g["ncdim_to_size"] if ncdim not in ncdim_to_size: logger.info( - f" Writing size {size} part " f"dimension{ncdim}" + f" Writing size {size} geometry part " + f"dimension: {ncdim}" ) # pragma: no cover ncdim_to_size[ncdim] = size @@ -2890,7 +2892,9 @@ def _write_netcdf_variable( if g["dry_run"]: return - logger.info(f" Writing {cfvar!r}") # pragma: no cover + logger.info( + f" Writing {cfvar!r} to variable: {ncvar}" + ) # pragma: no cover # Set 'construct_type' if not construct_type: @@ -2945,9 +2949,9 @@ def _write_netcdf_variable( ) logger.debug( - f" chunksizes: {chunksizes!r}\n" - f" contiguous: {contiguous!r}" - f" shards : {shards!r}" + f" chunksizes: {chunksizes!r}, " + f"contiguous: {contiguous!r}, " + f"shards: {shards!r}" ) # pragma: no cover # ------------------------------------------------------------ @@ -3178,7 +3182,7 @@ def _write_netcdf_variable( ) logger.info( - f" to variable: {ncvar}({', '.join(ncdimensions)})" + f" dimensions: ({', '.join(ncdimensions)})" ) # pragma: no cover try: @@ -4408,7 +4412,7 @@ def _write_field_or_domain( # Cell measures if cell_measures: cell_measures = " ".join(cell_measures) - logger.info( + logger.debug( " Writing cell_measures attribute to " f"variable {ncvar}: {cell_measures!r}" ) # pragma: no cover From 2eb45bebf5f06be41ae88c7a8899591438e9f666 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 13 Nov 2025 18:50:15 +0000 Subject: [PATCH 32/39] dev --- Changelog.rst | 2 +- cfdm/functions.py | 4 ++-- cfdm/read_write/netcdf/netcdfread.py | 17 +++++++++++------ cfdm/read_write/netcdf/netcdfwrite.py | 10 +++++----- docs/source/installation.rst | 2 +- setup.py | 2 +- 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index ea4c63280..d0de492d4 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -7,7 +7,7 @@ Version NEXTVERSION (https://github.com/NCAS-CMS/cfdm/issues/354) * Read Zarr v2 and v3 datasets that contain a group hierarchy with `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/355) -* New optional dependency: ``zarr>=3.1.2`` +* New optional dependency: ``zarr>=3.1.3`` * Removed 
dependency (now optional): ``zarr>=3.0.8`` * Reduce the time taken to import `cfdm` (https://github.com/NCAS-CMS/cfdm/issues/361) diff --git a/cfdm/functions.py b/cfdm/functions.py index d65a494fa..9d86f9556 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -406,7 +406,7 @@ def environment(display=True, paths=True): netCDF4: 1.7.2 /home/miniconda3/lib/python3.12/site-packages/netCDF4/__init__.py h5netcdf: 1.3.0 /home/miniconda3/lib/python3.12/site-packages/h5netcdf/__init__.py h5py: 3.12.1 /home/miniconda3/lib/python3.12/site-packages/h5py/__init__.py - zarr: 3.0.8 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py + zarr: 3.1.3 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py s3fs: 2024.12.0 /home/miniconda3/lib/python3.12/site-packages/s3fs/__init__.py scipy: 1.15.1 /home/miniconda3/lib/python3.12/site-packages/scipy/__init__.py dask: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/dask/__init__.py @@ -426,7 +426,7 @@ def environment(display=True, paths=True): netCDF4: 1.7.2 h5netcdf: 1.3.0 h5py: 3.12.1 - zarr: 3.0.8 + zarr: 3.1.3 s3fs: 2024.12.0 scipy: 1.15.1 dask: 2025.5.1 diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index 997e8606c..602205690 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -1472,7 +1472,7 @@ def read( "cfa_write": cfa_write, # -------------------------------------------------------- # Whether or not to store the dataset chunking and - # sharding strategy + # sharding strategies # -------------------------------------------------------- "store_dataset_chunks": bool(store_dataset_chunks), "store_dataset_shards": bool(store_dataset_shards), @@ -8263,7 +8263,7 @@ def _create_Data( ) # Set whether or not to read the data into memory - to_memory = self.read_vars["to_memory"] + to_memory = g["to_memory"] to_memory = "all" in to_memory or construct_type in to_memory data = self.implementation.initialise_Data( @@ -8278,7 +8278,7 @@ def _create_Data( if ncvar is not None: # Store the dataset chunking - if self.read_vars["store_dataset_chunks"]: + if g["store_dataset_chunks"]: # Only store the dataset chunking if 'data' has the # same shape as its netCDF variable. This may not be # the case for variables compressed by convention @@ -8288,7 +8288,7 @@ def _create_Data( self.implementation.nc_set_dataset_chunksizes(data, chunks) # Store the dataset sharding - if self.read_vars["store_dataset_shards"]: + if g["store_dataset_shards"]: # Only store the dataset sharding if 'data' has the # same shape as its netCDF variable. This may not be # the case for variables compressed by convention @@ -12049,6 +12049,10 @@ def _set_quantization(self, parent, ncvar): def _get_dataset_shards(self, ncvar): """Return a netCDF variable's dataset storage shards. + The sharding strategy is defined as the number of dataset + chunks (*not* the number of data array elements) along each + data array axis. + .. versionadded:: (cfdm) NEXTVERSION :Parameters: @@ -12085,8 +12089,9 @@ def _get_dataset_shards(self, ncvar): var = nc[ncvar] shards = var.shards if shards is not None: - # Re-cast 'shards' as the number of chunks (as opposed to - # data elemnents) along each of its dimensions + # 'shards' is currently the number of data array elements + # along each data array axis => re-cast it as the number + # of chunks along each of axis. 
shards = [s // c for s, c in zip(shards, var.chunks)] return shards, var.shape diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 556655353..1b336a8b8 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2630,10 +2630,10 @@ def _createVariable(self, **kwargs): chunks = shape if shards is not None: - # create the shard shape in the format expected by - # `zarr.create_array`, 'shards' is curerntly + # Create the shard shape in the format expected by + # `zarr.create_array`. 'shards' is currently # defined by how many *chunks* along each - # dimension are in each shard, but `zarr requires + # dimension are in each shard, but `zarr` requires # shards defined by how many *array elements* # along each dimension are in each shard. if chunks == shape: @@ -2658,8 +2658,8 @@ def _createVariable(self, **kwargs): if prod(shards) > 1: # More than one chunk per shard. # - # E.g. shards=(10, 11, 12), chunks=(10, 20, - # 30) => shards=(100, 220, 360) + # E.g. shards=(10, 11, 12) and chunks=(10, + # 20, 30) => shards=(100, 220, 360) shards = [c * n for c, n in zip(chunks, shards)] else: # One chunk per shard. diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 9e10dfb7e..26156d864 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -249,7 +249,7 @@ environments for which these features are not required. .. rubric:: Zarr -* `zarr `_, version 3.1.2 or newer. +* `zarr `_, version 3.1.3 or newer. For reading and writing Zarr datasets. diff --git a/setup.py b/setup.py index aada4f36b..e9d5fd2be 100755 --- a/setup.py +++ b/setup.py @@ -137,7 +137,7 @@ def _get_version(): "pydocstyle", ], "zarr": [ - "zarr>=3.1.2", + "zarr>=3.1.3", ], } From e02c0b7006eb1f606cc71478837f66a68c96aa85 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 09:23:36 +0000 Subject: [PATCH 33/39] Doc string and comment fixes and improvements Co-authored-by: Sadie L. Bartholomew --- cfdm/cfdmimplementation.py | 10 +++++++--- cfdm/data/aggregatedarray.py | 2 +- cfdm/data/netcdfindexer.py | 6 +++--- cfdm/mixin/netcdf.py | 10 +++++----- cfdm/read_write/netcdf/flatten/flatten.py | 16 ++++++++-------- cfdm/read_write/netcdf/netcdfwrite.py | 16 ++++++++-------- cfdm/read_write/netcdf/zarr.py | 4 ++-- cfdm/read_write/read.py | 2 +- cfdm/read_write/write.py | 8 ++++---- 9 files changed, 39 insertions(+), 35 deletions(-) diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 9dd2ab3d2..bf88739c0 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1106,7 +1106,9 @@ def nc_get_dataset_chunksizes(self, data): def nc_get_dataset_shards(self, data): """Get the Zarr dataset sharding strategy for the data. - ..versionadded:: (cfdm) NEXTVERSION + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_set_dataset_shards` :Parameters: @@ -1265,13 +1267,15 @@ def nc_set_hdf5_chunksizes(self, data, chunksizes): def nc_set_dataset_shards(self, data, shards): """Set the Zarr dataset sharding strategy for the data. - ..versionadded:: (cfdm) NEXTVERSION + .. versionadded:: (cfdm) NEXTVERSION + + .. seealso:: `nc_get_dataset_shards` :Parameters: data: `Data` - shards: `None` or `int` or sewunce of `int` + shards: `None` or `int` or sequence of `int` Set the sharding strategy when writing to a Zarr dataset. 
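As a sketch of the values accepted by the sharding setter described above (assuming a two-dimensional cfdm ``Data`` object ``d``; the variable name is illustrative)::

    d.nc_set_dataset_shards(None)    # let the writer choose a strategy
    d.nc_set_dataset_shards(9)       # roughly 9 dataset chunks per shard
    d.nc_set_dataset_shards((3, 3))  # 3 chunks per shard along each axis
    d.nc_dataset_shards()            # returns the stored strategy, e.g. (3, 3)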
diff --git a/cfdm/data/aggregatedarray.py b/cfdm/data/aggregatedarray.py index 9405a0c4c..c271bf453 100644 --- a/cfdm/data/aggregatedarray.py +++ b/cfdm/data/aggregatedarray.py @@ -267,7 +267,7 @@ def _parse_fragment_array(self, aggregated_filename, fragment_array): # 'uri' is scalar numpy string type uri = uri.item() except AttributeError: - # E.g. 'uri' is a `str` instance + # E.g. 'uri' is already a `str` instance pass parsed_fragment_array[index] = { diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py index 3312120c8..4f1efd370 100644 --- a/cfdm/data/netcdfindexer.py +++ b/cfdm/data/netcdfindexer.py @@ -401,7 +401,7 @@ def _default_FillValue(self, dtype): if kind == "T": # np.dtypes.StringDType, which stores variable-width - # string data in a UTF-8 encoding, as used by `zarr`) + # string data in a UTF-8 encoding, as used by `zarr` return "" return default_fillvals[dtype.str[1:]] @@ -629,9 +629,9 @@ def _mask(self, data, dtype, attributes, dtype_unsigned_int): if fvalisnan: mask = np.isnan(data) else: - # Must use `np.asanyarray` here, to ensure that that + # Must use `np.asanyarray` here, to ensure that # 'mask' is a never a `bool`, which would make the - # following 'mask.any' call' fail. + # following 'mask.any' call fail. mask = np.asanyarray(data == fval) if mask.any(): diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index 8dae4e7e5..30f8fb9dd 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -5163,7 +5163,7 @@ def nc_dataset_shards(self): :Returns: `None` or `int` or sequence of `int` - The current sharding strateg. One of: + The current sharding strategy. One of: {{sharding options}} @@ -5277,8 +5277,8 @@ def nc_set_dataset_shards(self, shards): if isinstance(shards, Integral): if shards < 1: raise ValueError( - f"'shards' must be None, a positive integer, or a " - f"sequence positive of integers. Got: {shards!r}" + "'shards' must be None, a positive integer, or a " + f"sequence of positive integers. Got: {shards!r}" ) self._set_netcdf("dataset_shards", shards) @@ -5288,8 +5288,8 @@ def nc_set_dataset_shards(self, shards): shards = tuple(shards) except TypeError: raise ValueError( - f"'shards' must be None, a positive integer, or a " - f"sequence positive of integers. Got: {shards!r}" + "'shards' must be None, a positive integer, or a " + f"sequence of positive integers. Got: {shards!r}" ) if len(shards) != len(self.shape): diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py index 72d69ae44..ee51f0a22 100644 --- a/cfdm/read_write/netcdf/flatten/flatten.py +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -148,7 +148,7 @@ def dataset_flatten( .. note:: For a netCDF dataset, for which it is always well-defined in which group a dimension is defined, *group_dimension_search* may only take - the default value of ``'closest_ancestor'`, + the default value of ``'closest_ancestor'``, which applies the behaviour defined by the CF conventions (section 2.7 Groups). @@ -313,7 +313,7 @@ def __init__( input_ds: The dataset to be flattened. Must be an object with - the the same API as `netCDF4.Dataset` or + the same API as `netCDF4.Dataset` or `h5netcdf.File`, or else a `zarr.Group` object. output_ds: `netCDF4.Dataset` @@ -383,7 +383,7 @@ def __init__( # '/forecast': {'element': }, # '/forecast/model': {}} # - # Cuurently this mapping is only required for an input + # Currently this mapping is only required for an input # `zarr.Group` dataset, and is generated by # `_populate_dimension_maps`. 
self._group_to_dims = {} @@ -396,7 +396,7 @@ def __init__( # 'latitude_longitude': (), # 'forecast/y': ( len(x) + x.__repr__() <==> repr(x) .. versionadded:: (cfdm) 1.12.2.0 diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py index 49b8b1a2e..4e1bf3395 100644 --- a/cfdm/read_write/read.py +++ b/cfdm/read_write/read.py @@ -381,7 +381,7 @@ def _datasets(self): if isdir(x): if is_zarr(x): # This directory is a Zarr dataset, so don't - # look in any subdirectories, which we contain + # look in any subdirectories, which contain # the dataset chunks (but note - it is allowed # for non-chunk subdirectories to exist, but # if they do we're going to ignore them!). diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index f75ec5e37..e04018ef1 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -17,7 +17,7 @@ class write(ReadWrite): **Dataset variable and dimension names** - These names are stored within constructs and are either read a + These names are stored within constructs and are either read from another dataset or may be set manually. They are used when writing a field construct to the dataset. If a name has not been set then one will be constructed (usually based on the standard @@ -569,7 +569,7 @@ class write(ReadWrite): * ``'contiguous'`` - The data will written to the dataset contiguously, + The data will be written to the dataset contiguously, i.e. no chunking. For a Zarr dataset, this is implemented as a single dataset chunk for the entire array. @@ -648,7 +648,7 @@ class write(ReadWrite): The integer number of dataset chunks to be stored in a single shard, favouring an equal number of dataset - chunks along each shard dimenson. + chunks along each shard dimension. *Example:* For two-dimensional data, ``dataset_shards=9`` will @@ -767,7 +767,7 @@ class write(ReadWrite): URIs). If ``'absolute'`` then all fragment dataset names will be written as absolute URIs. If ``'relative'`` then all fragment dataset names will be written as - relative-path URI references URIs, relative to the + relative-path URI references, relative to the location of the aggregation dataset. * ``'strict'``: `bool` From a07f9d693d02e9ba893dc5d09c012263616530ef Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 09:24:51 +0000 Subject: [PATCH 34/39] Typo Co-authored-by: Sadie L. Bartholomew --- docs/source/installation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 26156d864..2b517b39c 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -244,7 +244,7 @@ Optional ^^^^^^^^ Some further dependencies that enable further functionality are -optional. This to facilitate cfdm being installed in restricted +optional. This is to facilitate cfdm being installed in restricted environments for which these features are not required. .. 
rubric:: Zarr From c7c71b7c322f0dddcfe4e79be9d34b3a539d721b Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 09:49:44 +0000 Subject: [PATCH 35/39] netcdf_flatten deprecation --- Changelog.rst | 2 ++ cfdm/cfdmimplementation.py | 4 ++-- cfdm/functions.py | 28 ++++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index d0de492d4..1bcd39f33 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -7,6 +7,8 @@ Version NEXTVERSION (https://github.com/NCAS-CMS/cfdm/issues/354) * Read Zarr v2 and v3 datasets that contain a group hierarchy with `cfdm.read` (https://github.com/NCAS-CMS/cfdm/issues/355) +* New function `cfdm.dataset_flatten` that replaces the deprecated + `cfdm.netcdf_flatten` (https://github.com/NCAS-CMS/cfdm/issues/355) * New optional dependency: ``zarr>=3.1.3`` * Removed dependency (now optional): ``zarr>=3.0.8`` * Reduce the time taken to import `cfdm` diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index bf88739c0..3e3a407d2 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -1107,7 +1107,7 @@ def nc_get_dataset_shards(self, data): """Get the Zarr dataset sharding strategy for the data. .. versionadded:: (cfdm) NEXTVERSION - + .. seealso:: `nc_set_dataset_shards` :Parameters: @@ -1268,7 +1268,7 @@ def nc_set_dataset_shards(self, data, shards): """Set the Zarr dataset sharding strategy for the data. .. versionadded:: (cfdm) NEXTVERSION - + .. seealso:: `nc_get_dataset_shards` :Parameters: diff --git a/cfdm/functions.py b/cfdm/functions.py index 9d86f9556..760ced48b 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -2436,10 +2436,13 @@ def netcdf_flatten(*args, **kwargs): .. versionadded:: (cfdm) 1.11.2.0 """ - raise DeprecationError( - "Function 'netcdf_flatten' has been renamed 'dataset_flatten' " - "at version NEXTVERSION" - ) + _DEPRECATION_ERROR_FUNCTION( + "netcdf_flatten", + "Use 'cfdm.dataset_flatten' instead, " + "which has a slightly different API.", + version="NEXTVERSION", + removed_at="1.15.0.0", + ) # pragma: no cover def _DEPRECATION_ERROR_KWARGS( @@ -2491,3 +2494,20 @@ def _DEPRECATION_ERROR_METHOD( f"at version {version} and is no longer available{removed_at}. " f"{message}" ) + + +def _DEPRECATION_ERROR_FUNCTION( + func, message="", version=None, removed_at=None +): + """Error handling for deprecated functions. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + if removed_at: + removed_at = f" and will be removed at version {removed_at}" + + raise DeprecationError( + f"Function {func!r} has been deprecated at version {version} and is " + f"no longer available{removed_at}. 
{message}" + ) From 1e4968d5f3425e700550c5afee35a2572749ccda Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 09:52:34 +0000 Subject: [PATCH 36/39] rename NETCDF4 variable to is_netcdf4 --- cfdm/read_write/netcdf/netcdfwrite.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index fa21a055e..0f9de2a80 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -2589,8 +2589,8 @@ def _createVariable(self, **kwargs): contiguous = kwargs.get("contiguous") - NETCDF4 = g["dataset"].data_model.startswith("NETCDF4") - if NETCDF4 and contiguous: + is_netcdf4 = g["dataset"].data_model.startswith("NETCDF4") + if is_netcdf4 and contiguous: # NETCDF4 contiguous variables can't be compressed kwargs["compression"] = None kwargs["complevel"] = 0 From 0cc7aadf3c5f720c0b90abfbc031d468654b109f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 10:22:37 +0000 Subject: [PATCH 37/39] zarr no append --- cfdm/read_write/netcdf/netcdfwrite.py | 8 +++++++- cfdm/read_write/write.py | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 0f9de2a80..9c21c6993 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -5028,12 +5028,18 @@ def dataset_open(self, dataset_name, mode, fmt, fields): raise RuntimeError(f"{error}: {dataset_name}") case "zarr": + if mode == "a": + raise ValueError( + "Can't write with mode 'a' to a Zarr dataset" + ) + try: import zarr except ModuleNotFoundError as error: error.msg += ( ". Install the 'zarr' package " - "(https://pypi.org/project/zarr) to read Zarr datasets" + "(https://pypi.org/project/zarr) to write Zarr " + "datasets" ) raise diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index e04018ef1..d9ad7c88b 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -196,6 +196,8 @@ class write(ReadWrite): resultant fields using `set_domain_ancillary` and similar methods if required. + .. note: Zarr datasets can not be appended to. + ``'r+'`` Alias for ``'a'``. ======== ================================================= From f66f3a89e1db4ddbea0108a61f5745e4d8ac5b4e Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 10:42:58 +0000 Subject: [PATCH 38/39] Typo Co-authored-by: Sadie L. Bartholomew --- cfdm/mixin/netcdf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cfdm/mixin/netcdf.py b/cfdm/mixin/netcdf.py index 30f8fb9dd..2da9e9ee1 100644 --- a/cfdm/mixin/netcdf.py +++ b/cfdm/mixin/netcdf.py @@ -5295,7 +5295,7 @@ def nc_set_dataset_shards(self, shards): if len(shards) != len(self.shape): raise ValueError( f"When shards is a sequence it must have the same length as " - f"the number of data dimensions ({len(self.shape)}): " + f"the number of data dimensions ({len(self.shape)}). 
" f"Got: {shards!r} " ) From 769c2a996e7074f1ab2dc633a1d5c6440332e175 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 8 Jan 2026 13:34:19 +0000 Subject: [PATCH 39/39] fix no append with zarr --- cfdm/read_write/netcdf/netcdfwrite.py | 10 +++++----- cfdm/test/test_zarr.py | 10 +++++++++- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/cfdm/read_write/netcdf/netcdfwrite.py b/cfdm/read_write/netcdf/netcdfwrite.py index 9c21c6993..ec3b803d1 100644 --- a/cfdm/read_write/netcdf/netcdfwrite.py +++ b/cfdm/read_write/netcdf/netcdfwrite.py @@ -5028,11 +5028,6 @@ def dataset_open(self, dataset_name, mode, fmt, fields): raise RuntimeError(f"{error}: {dataset_name}") case "zarr": - if mode == "a": - raise ValueError( - "Can't write with mode 'a' to a Zarr dataset" - ) - try: import zarr except ModuleNotFoundError as error: @@ -5560,6 +5555,11 @@ def write( effective_fields = fields if mode == "a": + if fmt == "ZARR3": + raise ValueError( + "Can't write with mode 'a' to a Zarr dataset" + ) + # First read in the fields from the existing dataset: effective_fields = self._NetCDFRead(self.implementation).read( dataset_name, netcdf_backend="netCDF4" diff --git a/cfdm/test/test_zarr.py b/cfdm/test/test_zarr.py index e653a71ec..3f24ea303 100644 --- a/cfdm/test/test_zarr.py +++ b/cfdm/test/test_zarr.py @@ -136,7 +136,7 @@ def test_zarr_read_write_chunks_shards(self): self.assertEqual(z["q"].shards, (4, 6)) def test_zarr_read_write_CFA(self): - """Test CF aggreagtion in Zarr.""" + """Test CF aggregation in Zarr.""" f = self.f0 cfdm.write(f, tmpdir1, fmt="ZARR3") @@ -157,6 +157,14 @@ def test_zarr_read_write_CFA(self): self.assertTrue(z.equals(f)) self.assertTrue(z.equals(n)) + def test_zarr_write_append(self): + """Test in append mode with Zarr.""" + # Check that append mode does not work for Zarr + f = self.f0 + cfdm.write(f, tmpdir1, fmt='ZARR3') + with self.assertRaises(ValueError): + cfdm.write(f, tmpdir1, fmt='ZARR3', mode="a") + def test_zarr_groups_1(self): """Test for the general handling of Zarr hierarchical groups.""" f = cfdm.example_field(1)