From 88e4e3d223f6ba4af9c39b2c58b85abf0046e6cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Falco?=
Date: Tue, 13 Jan 2026 11:59:24 +0100
Subject: [PATCH 1/9] create dir before writing to netcdf

---
 xdas/core/datacollection.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py
index 4a95a5be..e9feca3d 100644
--- a/xdas/core/datacollection.py
+++ b/xdas/core/datacollection.py
@@ -239,6 +239,8 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
             location = "/".join([name, str(key)])
             if group is not None:
                 location = "/".join([group, location])
+            if not os.path.exists(os.path.dirname(fname)):
+                os.makedirs(os.path.dirname(fname), exist_ok=True)
             self[key].to_netcdf(
                 fname,
                 mode="a",

From dde1be0cee202c3164f9e49f5f056b9379d1f1d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Falco?=
Date: Tue, 13 Jan 2026 15:01:50 +0100
Subject: [PATCH 2/9] Create dirname for dataarray and datacollection, even if
 dirname is current dir (i.e., empty)

---
 xdas/core/dataarray.py      | 4 ++++
 xdas/core/datacollection.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py
index 2dd762a6..7a9ae94b 100644
--- a/xdas/core/dataarray.py
+++ b/xdas/core/dataarray.py
@@ -3,6 +3,7 @@
 import re
 import warnings
 from functools import partial
+import os
 
 import h5netcdf
 import h5py
@@ -904,6 +905,9 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
         attrs = {} if self.attrs is None else self.attrs
         attrs |= {"coordinate_interpolation": mapping} if mapping else attrs
         name = "__values__" if self.name is None else self.name
+        if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)):
+            os.makedirs(os.path.dirname(fname), exist_ok=True)
+
         with h5netcdf.File(fname, mode=mode) as file:
             if group is not None and group not in file:
                 file.create_group(group)

diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py
index e9feca3d..a2df095f 100644
--- a/xdas/core/datacollection.py
+++ b/xdas/core/datacollection.py
@@ -239,7 +239,7 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
             location = "/".join([name, str(key)])
             if group is not None:
                 location = "/".join([group, location])
-            if not os.path.exists(os.path.dirname(fname)):
+            if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)):
                 os.makedirs(os.path.dirname(fname), exist_ok=True)
             self[key].to_netcdf(
                 fname,

From 7175e1aa7ecc68a3bcc5f095d09224bbb736096d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Falco?=
Date: Mon, 16 Feb 2026 18:08:42 +0100
Subject: [PATCH 3/9] Function to split data array into chunks according to
 available memory limit
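
A usage sketch (the array contents and the 0.5 limit are illustrative, not
part of the change):

    import numpy as np
    import xdas as xd

    da = xd.DataArray(np.random.randn(10_000, 1_000))

    # Split into the smallest number of roughly equal chunks such that each
    # chunk stays below half of the currently available RAM (a single chunk
    # is returned if the array already fits).
    chunks = xd.fit_into_memory(da, RAM_limit=0.5)
    for chunk in chunks:
        ...  # process each piece independently
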
---
 xdas/__init__.py      |  1 +
 xdas/core/routines.py | 75 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+)

diff --git a/xdas/__init__.py b/xdas/__init__.py
index e9a85cb5..b3d1f156 100644
--- a/xdas/__init__.py
+++ b/xdas/__init__.py
@@ -25,5 +25,6 @@
     open_mfdatacollection,
     open_mfdatatree,
     plot_availability,
+    fit_into_memory,
     split,
 )

diff --git a/xdas/core/routines.py b/xdas/core/routines.py
index bc9927f3..6639a39e 100644
--- a/xdas/core/routines.py
+++ b/xdas/core/routines.py
@@ -4,6 +4,7 @@
 from collections import defaultdict
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from glob import glob
+import psutil
 
 import numpy as np
 import pandas as pd
@@ -741,6 +742,80 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None):
 
     return DataArray(data, coords, dims, name, attrs)
 
+import sys
+from types import ModuleType, FunctionType
+from gc import get_referents
+
+# Custom objects know their class.
+# Function objects seem to know way too much, including modules.
+# Exclude modules as well.
+BLACKLIST = type, ModuleType, FunctionType
+
+def getsize(obj):
+    """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714 """
+    if isinstance(obj, BLACKLIST):
+        raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))
+    seen_ids = set()
+    size = 0
+    objects = [obj]
+    while objects:
+        need_referents = []
+        for obj in objects:
+            if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids:
+                seen_ids.add(id(obj))
+                size += sys.getsizeof(obj)
+                need_referents.append(obj)
+        objects = get_referents(*need_referents)
+    return size
+
+def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinuities", dim="first", tolerance=None):
+    """
+    Check if a data array is too large to fit into a memory limit and split it if that is the case.
+
+    Splitting can either be performed at each discontinuity (along interpolated
+    coordinates), at a given set of indices (given as a list of int) or in order to get
+    a given number of equal-sized chunks (if a single int is provided).
+
+    Parameters
+    ----------
+    da : DataArray
+        The data array to split
+    RAM_limit : float, optional
+        Ratio of the available memory to not exceed. 
+    indices_or_sections : str, int or list of int, optional
+        If `indices_or_sections` is an integer N, the array will be divided into N
+        almost equal parts (can differ by one element if the `dim` size is not a multiple of
+        N). If `indices_or_sections` is a 1-D array of sorted integers, the entries
+        indicate where the array is split along `dim`. For example, `[2, 3]` would, for
+        `dim="first"`, result in [da[:2], da[2:3], da[3:]]. If `indices_or_sections` is
+        "discontinuities", the `dim` must be an interpolated coordinate and splitting
+        will occur at locations where there are two consecutive tie_indices with only
+        one index of difference and where the tie_values difference is greater than
+        `tolerance`. Defaults to "discontinuities".
+    dim : str, optional
+        The dimension along which to split, by default "first"
+    tolerance : float or timedelta64, optional
+        If `indices_or_sections="discontinuities"`, splitting will only occur on gaps and
+        overlaps that are bigger than `tolerance`. Zero tolerance by default.
+
+    Returns
+    -------
+    list of DataArray
+        The split data arrays.
+    """
+
+    available_RAM = psutil.virtual_memory().available # in bytes
+    n_chunks = 1+int(getsize(da) / (RAM_limit * available_RAM))
+
+    # n_chunks = max(2, n_chunks_to_fit) # make at least 2 chunks even if fitting into memory?
+
+    # print(f"DataArray size : {getsize(da)/1e9} GB")
+    # print(f"Available RAM : {available_RAM/1e9} GB")
+    # print(f"Memory limit : {RAM_limit * available_RAM/1e9} GB")
+    # print("n_chunks : ", n_chunks)
+
+    return split(da, n_chunks, dim, tolerance)
+
 
 def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None):
     """

From 77d61d4fb341b58dbacf6ea3827aa2a75c78bf63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Falco?=
Date: Tue, 17 Feb 2026 15:05:25 +0100
Subject: [PATCH 4/9] add psutil as a dependency
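
The memory budget probed by `fit_into_memory` comes from psutil (sketch; the
value is machine-dependent):

    import psutil

    available = psutil.virtual_memory().available  # available RAM, in bytes
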
---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 2ddc83ca..98e4a156 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,7 @@ dependencies = [
     "obspy",
     "pandas",
     "plotly",
+    "psutil",
     "scipy",
     "tqdm",
     "watchdog",

From 661521a97672d08a53a4eb99c35ee3ecea4f5575 Mon Sep 17 00:00:00 2001
From: Alister Trabattoni
Date: Thu, 5 Mar 2026 15:30:07 +0100
Subject: [PATCH 5/9] Formatting.

---
 xdas/__init__.py            |  2 +-
 xdas/core/dataarray.py      |  6 ++++--
 xdas/core/datacollection.py |  4 +++-
 xdas/core/routines.py       | 27 ++++++++++++++++-----------
 4 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/xdas/__init__.py b/xdas/__init__.py
index dd5944d4..908e0276 100644
--- a/xdas/__init__.py
+++ b/xdas/__init__.py
@@ -34,12 +34,12 @@
     combine_by_coords,
     combine_by_field,
     concatenate,
+    fit_into_memory,
     open_dataarray,
     open_datacollection,
     open_mfdataarray,
     open_mfdatacollection,
     open_mfdatatree,
     plot_availability,
-    fit_into_memory,
     split,
 )

diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py
index 43a5f5e1..e2d5f3df 100644
--- a/xdas/core/dataarray.py
+++ b/xdas/core/dataarray.py
@@ -1,7 +1,7 @@
 import copy
+import os
 import warnings
 from functools import partial
-import os
 
 import h5netcdf
 import h5py
@@ -885,7 +885,9 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
             dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs)
 
         # write data
-        if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)):
+        if os.path.dirname(fname) is not "" and not os.path.exists(
+            os.path.dirname(fname)
+        ):
             os.makedirs(os.path.dirname(fname), exist_ok=True)
 
         with h5netcdf.File(fname, mode=mode) as file:

diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py
index a2df095f..f9adf14a 100644
--- a/xdas/core/datacollection.py
+++ b/xdas/core/datacollection.py
@@ -239,7 +239,9 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
             location = "/".join([name, str(key)])
             if group is not None:
                 location = "/".join([group, location])
-            if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)):
+            if os.path.dirname(fname) is not "" and not os.path.exists(
+                os.path.dirname(fname)
+            ):
                 os.makedirs(os.path.dirname(fname), exist_ok=True)
             self[key].to_netcdf(
                 fname,

diff --git a/xdas/core/routines.py b/xdas/core/routines.py
index 3b3c6d59..ffbfca6a 100644
--- a/xdas/core/routines.py
+++ b/xdas/core/routines.py
@@ -5,11 +5,11 @@
 from concurrent.futures import ProcessPoolExecutor, as_completed
 from glob import glob
 from itertools import pairwise
-import psutil
 
 import numpy as np
 import pandas as pd
 import plotly.express as px
+import psutil
 import xarray as xr
 from tqdm import tqdm
 
@@ -743,19 +743,21 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None):
 
     return DataArray(data, coords, dims, name, attrs)
 
+
 import sys
-from types import ModuleType, FunctionType
 from gc import get_referents
+from types import FunctionType, ModuleType
 
 # Custom objects know their class.
 # Function objects seem to know way too much, including modules.
 # Exclude modules as well.
 BLACKLIST = type, ModuleType, FunctionType
 
+
 def getsize(obj):
-    """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714 """
+    """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714"""
     if isinstance(obj, BLACKLIST):
-        raise TypeError('getsize() does not take argument of type: '+ str(type(obj)))
+        raise TypeError("getsize() does not take argument of type: " + str(type(obj)))
     seen_ids = set()
     size = 0
     objects = [obj]
@@ -769,7 +771,14 @@ def getsize(obj):
         objects = get_referents(*need_referents)
     return size
 
-def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinuities", dim="first", tolerance=None):
+
+def fit_into_memory(
+    da,
+    RAM_limit: float = 0.8,
+    indices_or_sections="discontinuities",
+    dim="first",
+    tolerance=None,
+):
     """
     Check if a data array is too large to fit into a memory limit and split it if that is the case.
 
@@ -782,7 +791,7 @@ def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinu
     da : DataArray
         The data array to split
     RAM_limit : float, optional
-        Ratio of the available memory to not exceed. 
+        Ratio of the available memory to not exceed.
     indices_or_sections : str, int or list of int, optional
         If `indices_or_sections` is an integer N, the array will be divided into N
@@ -805,9 +814,9 @@ def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinu
         The split data arrays.
     """
 
-    available_RAM = psutil.virtual_memory().available # in bytes
-    n_chunks = 1+int(getsize(da) / (RAM_limit * available_RAM))
-    
+    available_RAM = psutil.virtual_memory().available  # in bytes
+    n_chunks = 1 + int(getsize(da) / (RAM_limit * available_RAM))
+
     # n_chunks = max(2, n_chunks_to_fit) # make at least 2 chunks even if fitting into memory?
 
     # print(f"DataArray size : {getsize(da)/1e9} GB")

From 1bb11085b40889ecb064d8513133441f51281ab4 Mon Sep 17 00:00:00 2001
From: Alister Trabattoni
Date: Thu, 5 Mar 2026 15:44:05 +0100
Subject: [PATCH 6/9] Remove auto RAM chunk-size related things.

---
 pyproject.toml        |  1 -
 xdas/__init__.py      |  1 -
 xdas/core/routines.py | 84 -------------------------------------------
 3 files changed, 86 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 7d5c4ee5..b8585569 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,7 +20,6 @@ dependencies = [
     "obspy",
     "pandas",
     "plotly",
-    "psutil",
     "setuptools<82.0.0",
     "scipy",
     "tqdm",

diff --git a/xdas/__init__.py b/xdas/__init__.py
index 908e0276..5f281fc4 100644
--- a/xdas/__init__.py
+++ b/xdas/__init__.py
@@ -34,7 +34,6 @@
     combine_by_coords,
     combine_by_field,
     concatenate,
-    fit_into_memory,
     open_dataarray,
     open_datacollection,
     open_mfdataarray,

diff --git a/xdas/core/routines.py b/xdas/core/routines.py
index ffbfca6a..c3a75dcb 100644
--- a/xdas/core/routines.py
+++ b/xdas/core/routines.py
@@ -9,7 +9,6 @@
 import numpy as np
 import pandas as pd
 import plotly.express as px
-import psutil
 import xarray as xr
 from tqdm import tqdm
 
@@ -744,89 +743,6 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None):
 
     return DataArray(data, coords, dims, name, attrs)
 
 
-import sys
-from gc import get_referents
-from types import FunctionType, ModuleType
-
-# Custom objects know their class.
-# Function objects seem to know way too much, including modules.
-# Exclude modules as well.
-BLACKLIST = type, ModuleType, FunctionType
-
-
-def getsize(obj):
-    """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714"""
-    if isinstance(obj, BLACKLIST):
-        raise TypeError("getsize() does not take argument of type: " + str(type(obj)))
-    seen_ids = set()
-    size = 0
-    objects = [obj]
-    while objects:
-        need_referents = []
-        for obj in objects:
-            if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids:
-                seen_ids.add(id(obj))
-                size += sys.getsizeof(obj)
-                need_referents.append(obj)
-        objects = get_referents(*need_referents)
-    return size
-
-
-def fit_into_memory(
-    da,
-    RAM_limit: float = 0.8,
-    indices_or_sections="discontinuities",
-    dim="first",
-    tolerance=None,
-):
-    """
-    Check if a data array is too large to fit into a memory limit and split it if that is the case.
-
-    Splitting can either be performed at each discontinuity (along interpolated
-    coordinates), at a given set of indices (given as a list of int) or in order to get
-    a given number of equal-sized chunks (if a single int is provided).
-
-    Parameters
-    ----------
-    da : DataArray
-        The data array to split
-    RAM_limit : float, optional
-        Ratio of the available memory to not exceed.
-    indices_or_sections : str, int or list of int, optional
-        If `indices_or_sections` is an integer N, the array will be divided into N
-        almost equal parts (can differ by one element if the `dim` size is not a multiple of
-        N). If `indices_or_sections` is a 1-D array of sorted integers, the entries
-        indicate where the array is split along `dim`. For example, `[2, 3]` would, for
-        `dim="first"`, result in [da[:2], da[2:3], da[3:]]. If `indices_or_sections` is
-        "discontinuities", the `dim` must be an interpolated coordinate and splitting
-        will occur at locations where there are two consecutive tie_indices with only
-        one index of difference and where the tie_values difference is greater than
-        `tolerance`. Defaults to "discontinuities".
-    dim : str, optional
-        The dimension along which to split, by default "first"
-    tolerance : float or timedelta64, optional
-        If `indices_or_sections="discontinuities"`, splitting will only occur on gaps and
-        overlaps that are bigger than `tolerance`. Zero tolerance by default.
-
-    Returns
-    -------
-    list of DataArray
-        The split data arrays.
-    """
-
-    available_RAM = psutil.virtual_memory().available  # in bytes
-    n_chunks = 1 + int(getsize(da) / (RAM_limit * available_RAM))
-
-    # n_chunks = max(2, n_chunks_to_fit) # make at least 2 chunks even if fitting into memory?
-
-    # print(f"DataArray size : {getsize(da)/1e9} GB")
-    # print(f"Available RAM : {available_RAM/1e9} GB")
-    # print(f"Memory limit : {RAM_limit * available_RAM/1e9} GB")
-    # print("n_chunks : ", n_chunks)
-
-    return split(da, n_chunks, dim, tolerance)
-
-
 def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None):
     """
     Split a data array along a dimension.

From cb0a93b4a59ea9a5b894fbd85e8ae6587ecc3121 Mon Sep 17 00:00:00 2001
From: Alister Trabattoni
Date: Thu, 5 Mar 2026 16:05:27 +0100
Subject: [PATCH 7/9] Add `create_dirs` kwarg.
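
A minimal usage sketch (the path is hypothetical; the API is as in the diff
below):

    import numpy as np
    import xdas as xd

    da = xd.DataArray(np.arange(3))

    # Without the kwarg, a missing parent directory raises FileNotFoundError.
    # da.to_netcdf("results/run1/data.nc")

    # With create_dirs=True, missing parents are created first via
    # os.makedirs(..., exist_ok=True); a bare filename (empty dirname) still
    # writes to the current directory as before.
    da.to_netcdf("results/run1/data.nc", create_dirs=True)
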
---
 xdas/core/dataarray.py      | 23 +++++++++++++++++------
 xdas/core/datacollection.py | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py
index e2d5f3df..1fa2cd94 100644
--- a/xdas/core/dataarray.py
+++ b/xdas/core/dataarray.py
@@ -826,7 +826,15 @@ def from_stream(cls, st, dims=("channel", "time")):
         }
         return cls(data, {dims[0]: channel, dims[1]: time})
 
-    def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
+    def to_netcdf(
+        self,
+        fname,
+        mode="w",
+        group=None,
+        virtual=None,
+        encoding=None,
+        create_dirs=False,
+    ):
         """
         Write DataArray contents to a netCDF file.
 
@@ -850,6 +858,8 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
             the `h5netcdf` engine to write the data. If you want to use a specific
             plugin for compression, you can use the `hdf5plugin` package. For example,
             to use the ZFP compression, you can use the `hdf5plugin.Zfp` class.
+        create_dirs : bool, optional
+            Whether to create parent directories if they do not exist. Default is False.
 
         Examples
         --------
@@ -884,12 +894,13 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
         for coord in self.coords.values():
             dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs)
 
-        # write data
-        if os.path.dirname(fname) is not "" and not os.path.exists(
-            os.path.dirname(fname)
-        ):
-            os.makedirs(os.path.dirname(fname), exist_ok=True)
+        # create parent directories if needed
+        if create_dirs:
+            dirname = os.path.dirname(fname)
+            if dirname:
+                os.makedirs(dirname, exist_ok=True)
 
+        # write data
         with h5netcdf.File(fname, mode=mode) as file:
             # group
             if group is not None and group not in file:

diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py
index f9adf14a..aafef713 100644
--- a/xdas/core/datacollection.py
+++ b/xdas/core/datacollection.py
@@ -231,7 +231,15 @@ def fields(self):
         )
         return uniquifiy(out)
 
-    def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
+    def to_netcdf(
+        self,
+        fname,
+        mode="w",
+        group=None,
+        virtual=None,
+        encoding=None,
+        create_dirs=False,
+    ):
         if mode == "w" and group is None and os.path.exists(fname):
             os.remove(fname)
         for key in self:
@@ -239,10 +247,10 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
             location = "/".join([name, str(key)])
             if group is not None:
                 location = "/".join([group, location])
-            if os.path.dirname(fname) is not "" and not os.path.exists(
-                os.path.dirname(fname)
-            ):
-                os.makedirs(os.path.dirname(fname), exist_ok=True)
+            if create_dirs:
+                dirname = os.path.dirname(fname)
+                if dirname:
+                    os.makedirs(dirname, exist_ok=True)
             self[key].to_netcdf(
                 fname,
@@ -445,9 +453,22 @@ def to_mapping(self):
     def from_mapping(cls, data):
         return cls(data.values(), data.name)
 
-    def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None):
+    def to_netcdf(
+        self,
+        fname,
+        mode="w",
+        group=None,
+        virtual=None,
+        encoding=None,
+        create_dirs=False,
+    ):
         self.to_mapping().to_netcdf(
-            fname, mode=mode, group=group, virtual=virtual, encoding=encoding
+            fname,
+            mode=mode,
+            group=group,
+            virtual=virtual,
+            encoding=encoding,
+            create_dirs=create_dirs,
         )
 
     @classmethod

From 0a1616b63915b07defb4624c036e0957eb581388 Mon Sep 17 00:00:00 2001
From: Alister Trabattoni
Date: Thu, 5 Mar 2026 16:16:36 +0100
Subject: [PATCH 8/9] Add tests for `create_dirs`.
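
The new tests can be run in isolation with, e.g. (assuming a standard pytest
setup):

    pytest tests/test_dataarray.py tests/test_datacollection.py -k create_dirs
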
---
 tests/test_dataarray.py      | 10 ++++++++++
 tests/test_datacollection.py | 17 +++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/tests/test_dataarray.py b/tests/test_dataarray.py
index 4a251f7e..acdabb35 100644
--- a/tests/test_dataarray.py
+++ b/tests/test_dataarray.py
@@ -458,6 +458,16 @@ def test_io_attrs(self):
             assert result.attrs == attrs
             assert result.equals(da)
 
+    def test_io_create_dirs(self):
+        da = xd.DataArray(np.arange(3))
+        with TemporaryDirectory() as dirpath:
+            path = os.path.join(dirpath, "subdir", "tmp.nc")
+            with pytest.raises(FileNotFoundError, match="No such file or directory"):
+                da.to_netcdf(path)
+            da.to_netcdf(path, create_dirs=True)
+            result = xd.DataArray.from_netcdf(path)
+            assert result.equals(da)
+
     def test_ufunc(self):
         da = wavelet_wavefronts()
         result = np.add(da, 1)

diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py
index 7ebd2c0f..d70cf22c 100644
--- a/tests/test_datacollection.py
+++ b/tests/test_datacollection.py
@@ -68,6 +68,23 @@ def test_io(self):
             result = xd.open_datacollection(path)
             assert result.equals(dc)
 
+    def test_io_create_dirs(self):
+        da = wavelet_wavefronts()
+        dc = xd.DataCollection(
+            {
+                "das1": da,
+                "das2": da,
+            },
+            "instrument",
+        )
+        with TemporaryDirectory() as dirpath:
+            path = os.path.join(dirpath, "subdir", "tmp.nc")
+            with pytest.raises(FileNotFoundError, match="No such file or directory"):
+                dc.to_netcdf(path)
+            dc.to_netcdf(path, create_dirs=True)
+            result = xd.DataCollection.from_netcdf(path)
+            assert result.equals(dc)
+
     def test_depth_counter(self):
         da = wavelet_wavefronts()
         da.name = "da"

From 4547c6a7be9e064b894a85cb4b8f2c8873282a17 Mon Sep 17 00:00:00 2001
From: Alister Trabattoni
Date: Thu, 5 Mar 2026 16:18:49 +0100
Subject: [PATCH 9/9] Add release note for `create_dirs`.

---
 docs/release-notes.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/release-notes.md b/docs/release-notes.md
index aa8001a3..9234b933 100644
--- a/docs/release-notes.md
+++ b/docs/release-notes.md
@@ -1,8 +1,10 @@
 # Release notes
 
 ## 0.2.5
-- Fix numpy 2.4 and obspy 1.4.2 incompatibilities (@atrabatto).
 - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni).
+- Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco).
+- Fix numpy 2.4 and obspy 1.4.2 incompatibilities (@atrabatto).
+
 ## 0.2.4
 - Add StreamWriter to write long time series to miniSEED (@marbail).
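
A round-trip sketch of the behavior added by this series (hypothetical output
path; the collection mirrors the one used in the tests):

    import os
    import numpy as np
    import xdas as xd

    dc = xd.DataCollection({"das1": xd.DataArray(np.arange(3))}, "instrument")
    path = os.path.join("out", "nested", "collection.nc")

    dc.to_netcdf(path, create_dirs=True)  # parent dirs created on demand
    assert xd.open_datacollection(path).equals(dc)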