From 203588075c611180bc11c033fc3f65c0596464fe Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 6 Aug 2025 15:43:26 +0200 Subject: [PATCH 001/104] wip: first draft before testing. --- xdas/core/coordinates.py | 453 ++++++++++++++++++++++++++++++++++----- 1 file changed, 399 insertions(+), 54 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index cf638e7c..20735333 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -345,6 +345,58 @@ def to_dict(self): def from_dict(cls, dct): return cls(**dct) + def format_index(self, idx, bounds="raise"): + idx = np.asarray(idx) + if not np.issubdtype(idx.dtype, np.integer): + raise IndexError("only integer are valid index") + idx = idx + (idx < 0) * len(self) + if bounds == "raise": + if np.any(idx < 0) or np.any(idx >= len(self)): + raise IndexError("index is out of bounds") + elif bounds == "clip": + idx = np.clip(idx, 0, len(self)) + return idx + + def format_index_slice(self, slc): + start = slc.start + stop = slc.stop + step = slc.step + if start is None: + start = 0 + if stop is None: + stop = len(self) + if step is None: + step = 1 + start = self.format_index(start, bounds="clip") + stop = self.format_index(stop, bounds="clip") + return slice(start, stop, step) + + def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + if start is not None: + try: + start_index = self.get_indexer(start, method="bfill") + except KeyError: + start_index = len(self) + else: + start_index = None + if stop is not None: + try: + end_index = self.get_indexer(stop, method="ffill") + stop_index = end_index + 1 + except KeyError: + stop_index = 0 + else: + stop_index = None + if step is not None: + raise NotImplementedError("cannot use step yet") + if ( + (not endpoint) + and (stop is not None) + and (self[stop_index - 1].values == stop) + ): + stop_index -= 1 + return slice(start_index, stop_index) + class ScalarCoordinate(Coordinate): def __new__(cls, *args, 
**kwargs): @@ -705,18 +757,6 @@ def get_value(self, index): index = self.format_index(index) return forward(index, self.tie_indices, self.tie_values) - def format_index(self, idx, bounds="raise"): - idx = np.asarray(idx) - if not np.issubdtype(idx.dtype, np.integer): - raise IndexError("only integer are valid index") - idx = idx + (idx < 0) * len(self) - if bounds == "raise": - if np.any(idx < 0) or np.any(idx >= len(self)): - raise IndexError("index is out of bounds") - elif bounds == "clip": - idx = np.clip(idx, 0, len(self)) - return idx - def slice_index(self, index_slice): index_slice = self.format_index_slice(index_slice) start_index, stop_index, step_index = ( @@ -725,11 +765,13 @@ def slice_index(self, index_slice): index_slice.step, ) if stop_index - start_index <= 0: - return self.__class__(dict(tie_indices=[], tie_values=[])) + return self.__class__(dict(tie_indices=[], tie_values=[], dim=self.dim)) elif (stop_index - start_index) <= step_index: tie_indices = [0] tie_values = [self.get_value(start_index)] - return self.__class__(dict(tie_indices=tie_indices, tie_values=tie_values)) + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values, dim=self.dim) + ) else: end_index = stop_index - 1 start_value = self.get_value(start_index) @@ -752,20 +794,6 @@ def slice_index(self, index_slice): coord = coord.decimate(step_index) return coord - def format_index_slice(self, slc): - start = slc.start - stop = slc.stop - step = slc.step - if start is None: - start = 0 - if stop is None: - stop = len(self) - if step is None: - step = 1 - start = self.format_index(start, bounds="clip") - stop = self.format_index(stop, bounds="clip") - return slice(start, stop, step) - def get_indexer(self, value, method=None): if isinstance(value, str): value = np.datetime64(value) @@ -786,32 +814,6 @@ def get_indexer(self, value, method=None): raise e return indexer - def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): - if start is not 
None: - try: - start_index = self.get_indexer(start, method="bfill") - except KeyError: - start_index = len(self) - else: - start_index = None - if stop is not None: - try: - end_index = self.get_indexer(stop, method="ffill") - stop_index = end_index + 1 - except KeyError: - stop_index = 0 - else: - stop_index = None - if step is not None: - raise NotImplementedError("cannot use step yet") - if ( - (not endpoint) - and (stop is not None) - and (self[stop_index - 1].values == stop) - ): - stop_index -= 1 - return slice(start_index, stop_index) - def append(self, other): if not isinstance(other, self.__class__): raise TypeError(f"cannot append {type(other)} to {self.__class__}") @@ -959,6 +961,349 @@ def to_dict(self): return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} +class SampledCoordinate(Coordinate): + """ + A coordinate that is sampled at regular intervals. + + Parameters + ---------- + data : dict-like + The data of the coordinate. + dim : str, optional + The dimension name of the coordinate, by default None. + dtype : str or numpy.dtype, optional + The data type of the coordinate, by default None. 
+ """ + + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + if data is None: + data = {"tie_values": [], "tie_samples": [], "sampling_interval": None} + data, dim = parse(data, dim) + if not self.__class__.isvalid(data): + raise TypeError("`data` must be dict-like") + if not set(data) == {"tie_values", "tie_samples", "sampling_interval"}: + raise ValueError( + "keys `tie_values`, `tie_samples`, and `sampling_interval` must be provided" + ) + tie_values = np.asarray(data["tie_values"], dtype=dtype) + tie_samples = np.asarray(data["tie_samples"]) + sampling_interval = np.asarray(data["sampling_interval"]) + if not tie_values.ndim == 1: + raise ValueError("`tie_values` must be 1D") + if not tie_samples.ndim == 1: + raise ValueError("`tie_samples` must be 1D") + if not len(tie_values) == len(tie_samples): + raise ValueError("`tie_values` and `tie_samples` must have the same length") + if not ( + np.issubdtype(tie_values.dtype, np.number) + or np.issubdtype(tie_values.dtype, np.datetime64) + ): + raise ValueError("`tie_values` must have either numeric or datetime dtype") + if not self.empty: + if not np.issubdtype(tie_samples.dtype, np.integer): + raise ValueError("`tie_samples` must be integer-like") + if not np.all(tie_samples > 0): + raise ValueError("`tie_samples` must be positive integers") + if not np.isscalar(sampling_interval): + raise ValueError("`sampling_interval` must be a scalar value") + if np.issubdtype(sampling_interval.dtype, np.datetime64): + sampling_interval = sampling_interval.astype("timedelta64[ns]") + else: + sampling_interval = np.asarray(sampling_interval).astype(dtype) + tie_samples = tie_samples.astype(int) + self.data = dict( + tie_values=tie_values, + tie_samples=tie_samples, + sampling_interval=sampling_interval, + ) + self.dim = dim + + @staticmethod + def isvalid(data): + match data: + case { + "tie_values": _, + "tie_samples": _, + "sampling_interval": _, + }: + 
return True + case _: + return False + + def __len__(self): + if self.empty: + return 0 + else: + return sum(self.tie_samples) + + def __repr__(self): + if self.empty: + return "empty coordinate" + elif len(self) == 1: + return f"{self.tie_values[0]}" + else: + if np.issubdtype(self.dtype, np.floating): + return f"{self.start:.3f} to {self.end:.3f}" + elif np.issubdtype(self.dtype, np.datetime64): + self.start = format_datetime(self.start) + self.end = format_datetime(self.end) + return f"{self.start} to {self.end}" + else: + return f"{self.start} to {self.end}" + + def __getitem__(self, item): + if isinstance(item, slice): + return self.slice_index(item) + elif np.isscalar(item): + return ScalarCoordinate(self.get_value(item), None) + else: + return DenseCoordinate(self.get_value(item), self.dim) + + def __add__(self, other): + return self.__class__( + { + "tie_values": self.tie_values + other, + "tie_samples": self.tie_samples, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def __sub__(self, other): + return self.__class__( + { + "tie_values": self.tie_values - other, + "tie_samples": self.tie_samples, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def __array__(self, dtype=None): + out = self.values + if dtype is not None: + out = out.__array__(dtype) + return out + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError + + @property + def tie_values(self): + return self.data["tie_values"] + + @property + def tie_samples(self): + return self.data["tie_samples"] + + @property + def sampling_interval(self): + return self.data["sampling_interval"] + + @property + def tie_indices(self): + return np.concatenate(([0], np.cumsum(self.tie_samples[:-1]))) + + @property + def empty(self): + return self.tie_values.shape == (0,) + + @property + def dtype(self): + return self.tie_values.dtype + + 
@property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(len(self)) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + + @property + def start(self): + return self.tie_values[0] + + @property + def end(self): + return self.tie_values[-1] + self.sampling_interval * self.tie_samples[-1] + + def equals(self, other): + return ( + np.array_equal(self.tie_values, other.tie_values) + and np.array_equal(self.tie_samples, other.tie_samples) + and self.sampling_interval == other.sampling_interval + and self.dim == other.dim + and self.dtype == other.dtype + ) + + def get_value(self, index): + index = self.format_index(index) + if np.any(index < 0) or np.any(index >= len(self)): + raise IndexError("index is out of bounds") + reference = np.searchsorted(self.tie_indices, index) + return self.tie_values[reference] + ( + (index - self.tie_indices[reference]) * self.sampling_interval + ) + + def slice_index(self, index_slice): + index_slice = self.format_index_slice(index_slice) + start_index, stop_index, step_index = ( + index_slice.start, + index_slice.stop, + index_slice.step, + ) + if stop_index - start_index <= 0: + return self.__class__( + dict( + tie_values=[], + tie_samples=[], + sampling_interval=self.sampling_interval, + ), + self.dim, + ) + elif (stop_index - start_index) <= step_index: + tie_values = [self.get_value(start_index)] + tie_samples = [stop_index - start_index] + return self.__class__( + dict( + tie_values=tie_values, + tie_samples=tie_samples, + sampling_interval=self.sampling_interval, + ), + self.dim, + ) + else: + # keep tie values, number of samples and related tie indices contained in the slice + mask = (start_index < self.tie_indices) & (self.tie_indices <= stop_index) + tie_values = 
self.tie_values[mask] + tie_samples = self.tie_samples[mask] + tie_indices = self.tie_indices[mask] + + # insert the missing start value + start_value = self.get_value(start_index) + tie_values = np.concatenate([[start_value], self.tie_values[mask]]) + + # insert the missing start number of samples and adjust the end one + tie_samples = np.concatenate( + [[start_index - tie_indices[0]], tie_samples[mask]] + ) + tie_samples[-1] = stop_index - tie_indices[-1] + + # repack data and decimate if needed + data = { + "tie_values": tie_values, + "tie_samples": tie_samples, + "sampling_interval": self.sampling_interval, + } + coord = self.__class__(data, self.dim) + if step_index != 1: + coord = coord.decimate(step_index) + return coord + + def get_indexer(self, value, method=None): + if isinstance(value, str): + value = np.datetime64(value) + else: + value = np.asarray(value) + # Check that value lies within the coordinate value range (vectorized) + if np.any(value < self.start) or np.any(value > self.end): + raise KeyError("index not found") + if not is_strictly_increasing(self.tie_values): + raise ValueError("tie_values must be strictly increasing") + reference = np.searchsorted(self.tie_values, value) + offset = (value - self.tie_values[reference]) / self.sampling_interval + if method == "nearest": + offset = np.round(offset).astype(int) + elif method == "ffill": + offset = np.floor(offset).astype(int) + elif method == "bfill": + offset = np.ceil(offset).astype(int) + else: + raise ValueError("method must be one of 'nearest', 'ffill', or 'bfill'") + return self.tie_indices[reference] + offset + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + if self.empty: + return other + if other.empty: + return self + if not self.dtype == other.dtype: + raise ValueError("cannot append 
coordinate with different dtype") + if not self.sampling_interval == other.sampling_interval: + raise ValueError( + "cannot append coordinate with different sampling intervals" + ) + tie_values = np.concatenate([self.tie_values, other.tie_values]) + tie_samples = np.concatenate([self.tie_samples, other.tie_samples + len(self)]) + return self.__class__( + { + "tie_values": tie_values, + "tie_samples": tie_samples, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def decimate(self, q): + raise NotImplementedError("decimation is not implemented for SampledCoordinate") + + def simplify(self, tolerance=None): + raise NotImplementedError( + "simplification is not implemented for SampledCoordinate" + ) + + def get_discontinuities(self): + raise NotImplementedError( + "get_discontinuities is not implemented for SampledCoordinate" + ) + + def get_availabilities(self): + raise NotImplementedError( + "get_availabilities is not implemented for SampledCoordinate" + ) + + @classmethod + def from_array(cls, arr, dim=None, sampling_interval=None): + raise NotImplementedError("from_array is not implemented for SampledCoordinate") + + def to_dict(self): + tie_values = self.data["tie_values"] + tie_samples = self.data["tie_samples"] + if np.issubdtype(tie_values.dtype, np.datetime64): + tie_values = tie_values.astype(str) + data = { + "tie_values": tie_values.tolist(), + "tie_samples": tie_samples.tolist(), + "sampling_interval": self.sampling_interval, + } + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + + def parse(data, dim=None): if isinstance(data, tuple): if dim is None: From f9e456f14461581f9d6873587500a86b4bbd49a5 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 6 Aug 2025 15:48:22 +0200 Subject: [PATCH 002/104] Fix wrong edit in InterpCoordinate. 
--- xdas/core/coordinates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 20735333..1f712024 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -765,12 +765,12 @@ def slice_index(self, index_slice): index_slice.step, ) if stop_index - start_index <= 0: - return self.__class__(dict(tie_indices=[], tie_values=[], dim=self.dim)) + return self.__class__(dict(tie_indices=[], tie_values=[]), dim=self.dim) elif (stop_index - start_index) <= step_index: tie_indices = [0] tie_values = [self.get_value(start_index)] return self.__class__( - dict(tie_indices=tie_indices, tie_values=tie_values, dim=self.dim) + dict(tie_indices=tie_indices, tie_values=tie_values), dim=self.dim ) else: end_index = stop_index - 1 From f8fb06badcaf820a8094c773866c23445163ce34 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 2 Dec 2025 15:19:06 +0100 Subject: [PATCH 003/104] Refactor import statements to use alias for xdas module --- tests/test_signal.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_signal.py b/tests/test_signal.py index e14c41a1..3cc44993 100644 --- a/tests/test_signal.py +++ b/tests/test_signal.py @@ -5,7 +5,7 @@ import scipy.signal as sp import xarray as xr -import xdas +import xdas as xd import xdas.signal as xs from xdas.synthetics import wavelet_wavefronts @@ -15,7 +15,7 @@ def test_get_sample_spacing(self): shape = (6000, 1000) resolution = (np.timedelta64(8, "ms"), 5.0) starttime = np.datetime64("2023-01-01T00:00:00") - da = xdas.DataArray( + da = xd.DataArray( data=np.random.randn(*shape).astype("float32"), coords={ "time": { @@ -39,7 +39,7 @@ def test_deterend(self): d = 5.0 s = d * np.arange(n) da = xr.DataArray(np.arange(n), {"time": s}) - da = xdas.DataArray.from_xarray(da) + da = xd.DataArray.from_xarray(da) da = xs.detrend(da) assert np.allclose(da.values, np.zeros(n)) @@ -48,7 +48,7 @@ def 
test_differentiate(self): d = 5.0 s = (d / 2) + d * np.arange(n) da = xr.DataArray(np.ones(n), {"distance": s}) - da = xdas.DataArray.from_xarray(da) + da = xd.DataArray.from_xarray(da) da = xs.differentiate(da, midpoints=True) assert np.allclose(da.values, np.zeros(n - 1)) @@ -57,7 +57,7 @@ def test_integrate(self): d = 5.0 s = (d / 2) + d * np.arange(n) da = xr.DataArray(np.ones(n), {"distance": s}) - da = xdas.DataArray.from_xarray(da) + da = xd.DataArray.from_xarray(da) da = xs.integrate(da, midpoints=True) assert np.allclose(da.values, da["distance"].values) @@ -71,7 +71,7 @@ def test_segment_mean_removal(self): da = xr.DataArray(data, {"distance": s}) da.loc[{"distance": slice(limits[0], limits[1])}] = 1.0 da.loc[{"distance": slice(limits[1], limits[2])}] = 2.0 - da = xdas.DataArray.from_xarray(da) + da = xd.DataArray.from_xarray(da) da = xs.segment_mean_removal(da, limits) assert np.allclose(da.values, 0) @@ -82,7 +82,7 @@ def test_sliding_window_removal(self): s = np.linspace(0, 1000, n) data = np.ones(n) da = xr.DataArray(data, {"distance": s}) - da = xdas.DataArray.from_xarray(da) + da = xd.DataArray.from_xarray(da) da = xs.sliding_mean_removal(da, 0.1 * n * d) assert np.allclose(da.values, 0) @@ -136,7 +136,7 @@ def test_sosfiltfilt(self): def test_filter(self): da = wavelet_wavefronts() axis = da.get_axis_num("time") - fs = 1 / xdas.get_sampling_interval(da, "time") + fs = 1 / xd.get_sampling_interval(da, "time") sos = sp.butter( 4, [5, 10], @@ -172,12 +172,12 @@ def test_filter(self): def test_decimate_virtual_stack(self): da = wavelet_wavefronts() expected = xs.decimate(da, 5, dim="time") - chunks = xdas.split(da, 5, "time") + chunks = xd.split(da, 5, "time") with tempfile.TemporaryDirectory() as tmpdirname: for i, chunk in enumerate(chunks): chunk_path = os.path.join(tmpdirname, f"chunk_{i}.nc") chunk.to_netcdf(chunk_path) - da_virtual = xdas.open_mfdataarray(os.path.join(tmpdirname, "chunk_*.nc")) + da_virtual = 
xd.open_mfdataarray(os.path.join(tmpdirname, "chunk_*.nc")) result = xs.decimate(da_virtual, 5, dim="time") assert result.equals(expected) @@ -186,7 +186,7 @@ class TestSTFT: def test_compare_with_scipy(self): starttime = np.datetime64("2023-01-01T00:00:00") endtime = starttime + 9999 * np.timedelta64(10, "ms") - da = xdas.DataArray( + da = xd.DataArray( data=np.random.rand(10000, 11), coords={ "time": {"tie_indices": [0, 9999], "tie_values": [starttime, endtime]}, @@ -241,7 +241,7 @@ def test_retrieve_frequency_peak(self): amp = 2 * np.sqrt(2) time = np.arange(N) / float(fs) data = amp * np.sin(2 * np.pi * fc * time) - da = xdas.DataArray( + da = xd.DataArray( data=data, coords={"time": time}, ) @@ -254,7 +254,7 @@ def test_retrieve_frequency_peak(self): def test_parrallel(self): starttime = np.datetime64("2023-01-01T00:00:00") endtime = starttime + 9999 * np.timedelta64(10, "ms") - da = xdas.DataArray( + da = xd.DataArray( data=np.random.rand(10000, 11), coords={ "time": {"tie_indices": [0, 9999], "tie_values": [starttime, endtime]}, @@ -282,7 +282,7 @@ def test_parrallel(self): def test_last_dimension_with_non_dimensional_coordinates(self): starttime = np.datetime64("2023-01-01T00:00:00") endtime = starttime + 99 * np.timedelta64(10, "ms") - da = xdas.DataArray( + da = xd.DataArray( data=np.random.rand(100, 1001), coords={ "time": {"tie_indices": [0, 99], "tie_values": [starttime, endtime]}, From cb87e9e4d4bc81ff447c64a27f075044a37d9665 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 2 Dec 2025 16:11:47 +0100 Subject: [PATCH 004/104] Add tapered_selection function and corresponding tests --- tests/test_picking.py | 57 ++++++++++++++++++ xdas/picking.py | 130 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 187 insertions(+) create mode 100644 tests/test_picking.py create mode 100644 xdas/picking.py diff --git a/tests/test_picking.py b/tests/test_picking.py new file mode 100644 index 00000000..a896830c --- /dev/null +++ 
b/tests/test_picking.py @@ -0,0 +1,57 @@ +import numpy as np + +import xdas as xd +from xdas.picking import tapered_selection + + +class TestTaperedSelection: + def test_basic_functionality(self): + da = xd.DataArray( + data=np.arange(5 * 10).reshape(5, 10).astype(float), + coords={ + "distance": { + "tie_indices": [0, 4], + "tie_values": [0.0, 400.0], + }, + "time": { + "tie_indices": [0, 9], + "tie_values": [ + np.datetime64("2023-01-01T00:00:00"), + np.datetime64("2023-01-01T00:00:09"), + ], + }, + }, + ) + + start = xd.DataArray( + data=np.array( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:03")] * 2 + + [np.datetime64("NaT")] * 2 + ), + coords={"distance": da["distance"]}, + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:07")] * 2 + + [np.datetime64("NaT")] * 2 + ) + window = [0.5, 1.0, 0.5] + + result = tapered_selection(da, start, end, window, dim="time") + + expected = xd.DataArray( + data=[ + [6.5, 14.0, 15.0, 16.0, 8.5], + [11.5, 24.0, 25.0, 26.0, 13.5], + ], + coords={ + "distance": [100.0, 200.0], + "time": { + "tie_indices": [0, 4], + "tie_values": [0.0, 4.0], + }, + }, + ) + + assert result.equals(expected) diff --git a/xdas/picking.py b/xdas/picking.py new file mode 100644 index 00000000..0974a7b1 --- /dev/null +++ b/xdas/picking.py @@ -0,0 +1,130 @@ +import numpy as np +from numba import njit, prange +from scipy.fft import next_fast_len + +import xdas as xd + + +def tapered_selection(da, start, end, window, size=None, dim="last"): + """ + Selects and tapers a DataArray based on `start` and `end` values. + + Coordinates with NaN or NaT `start` or `end` values are ignored. If no `size` is + provided, the length of the resulting data is determined by the next fast length + (for FFT) of the maximum distance between the start and end values. The tapering + window is split in half and applied to the start and end of the selected data. The + window size must be smaller than the smallest selected data window. 
+ + Parameters + ---------- + da : DataArray + Input data array to select and taper. Must be 2D and have `dim` as one of its + dimensions. + start : array-like + Start values along the other dimension than `dim` (must be 1D and have the + same size) NaN or NaT values indicate coordinates to ignored. + end : array-like + End values along the other dimension than `dim` (must be 1D and have the + same size) NaN or NaT values indicate coordinates to ignored. + size : int, optional + Size of the output data along `dim`. If None, it is determined by the next + fast length of the maximum selected window. + dim : str, optional + Dimension along which to perform the selection and tapering. Default is'last'. + window : array-like + Tapering window to apply to the selected data. + + Returns + ------- + DataArray + A DataArray containing the selected and tapered data with sizes {other_dim: N, + `dim`: `size`}, where N is the number of valid start/end pairs. The `dim` + dimension becomes the last dimension and its coordinates run from 0 to + d * (size - 1), where d is the sampling interval along `dim`. 
+ + """ + # convert to numpy + data = np.asarray(da.transpose(..., dim)) + start = np.asarray(start) + end = np.asarray(end) + window = np.asarray(window) + + # check shapes + if not data.shape[:-1] == start.shape == end.shape: + raise ValueError("shape mismatch between `da`, `start`, and `end`") + + # make window even-sized (central value should be 1 so can be skipped) + if window.size % 2 != 0: + half_size = window.size // 2 + window = np.concatenate((window[:half_size], window[-half_size:])) + + # select valid start/end + mask = np.isfinite(start) & np.isfinite(end) + selection = np.nonzero(mask)[0] + + # get selection indices + startindex = da[dim].get_indexer(start[selection], method="bfill") + endindex = da[dim].get_indexer(end[selection], method="ffill") + stopindex = endindex + 1 + + # determine output size + if size is None: + size = next_fast_len(max(stopindex - startindex)) + + # check window size + if min(stopindex - startindex) < window.size: + raise ValueError("some selected windows are smaller than the window size") + + # perform tapered selection + data = _tapered_selection( + data, + selection, + startindex, + stopindex, + size, + window, + ) + + # update output coords + coords = {} + for name in da.coords: + if da[name].dim == dim: + if name == dim: + coords[name] = { + "tie_indices": [0, size - 1], + "tie_values": [0.0, (size - 1) * xd.get_sampling_interval(da, dim)], + } + else: + pass # skip non-dimensional coords for `dim` + else: + coords[name] = da[name][selection] + + # return output DataArray + return xd.DataArray(data, coords=coords) + + +@njit(parallel=True) +def _tapered_selection(data, sel, start, stop, size, window): + out = np.zeros((sel.size, size), dtype=data.dtype) + w = window.size // 2 + for i in prange(sel.size): + j = 0 + n = stop[i] - start[i] + p = sel[i] + q = start[i] + k = 0 + while j < w: + out[i, j] = data[p, q] * window[k] + j += 1 + q += 1 + k += 1 + while j < n - w: + out[i, j] = data[p, q] + j += 1 + q += 1 + while 
j < n: + out[i, j] = data[p, q] * window[k] + j += 1 + q += 1 + k += 1 + return out From abfbc7c41e7b3bed6db36f684eb9a1cb34c20966 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 2 Dec 2025 16:38:39 +0100 Subject: [PATCH 005/104] test win_size_error and other_dim for tapered_selection --- tests/test_picking.py | 60 +++++++++++++++++++++++++++++++++++++++++-- xdas/picking.py | 18 ++++++++----- 2 files changed, 69 insertions(+), 9 deletions(-) diff --git a/tests/test_picking.py b/tests/test_picking.py index a896830c..81052cfd 100644 --- a/tests/test_picking.py +++ b/tests/test_picking.py @@ -3,10 +3,12 @@ import xdas as xd from xdas.picking import tapered_selection +import pytest + class TestTaperedSelection: - def test_basic_functionality(self): - da = xd.DataArray( + def generate(self): + return xd.DataArray( data=np.arange(5 * 10).reshape(5, 10).astype(float), coords={ "distance": { @@ -23,6 +25,9 @@ def test_basic_functionality(self): }, ) + def test_basic_functionality(self): + da = self.generate() + start = xd.DataArray( data=np.array( [np.datetime64("NaT")] @@ -55,3 +60,54 @@ def test_basic_functionality(self): ) assert result.equals(expected) + + def test_window_size_error(self): + da = self.generate() + + start = xd.DataArray( + data=np.array( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:08")] * 2 + + [np.datetime64("NaT")] * 2 + ), + coords={"distance": da["distance"]}, + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:09")] * 2 + + [np.datetime64("NaT")] * 2 + ) + window = [0.5, 1.0, 0.5] + + with pytest.raises( + ValueError, match="some selected windows are smaller than the window size" + ): + tapered_selection(da, start, end, window, dim="time") + + def test_other_dim(self): + da = self.generate() + + start = [np.nan] + [100.0] * 2 + [np.nan] * 7 + end = [np.nan] + [300.0] * 2 + [np.nan] * 7 + window = [0.5, 1.0, 0.5] + + result = tapered_selection(da, start, end, window, dim="distance") + + expected 
= xd.DataArray( + data=[ + [5.5, 21.0, 15.5], + [6.0, 22.0, 16.0], + ], + coords={ + "time": [ + np.datetime64("2023-01-01T00:00:01"), + np.datetime64("2023-01-01T00:00:02"), + ], + "distance": { + "tie_indices": [0, 2], + "tie_values": [0.0, 200.0], + }, + }, + ) + + assert result.equals(expected) diff --git a/xdas/picking.py b/xdas/picking.py index 0974a7b1..1abfec8e 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -43,8 +43,11 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): d * (size - 1), where d is the sampling interval along `dim`. """ + # transpose so `dim` is last + da = da.transpose(..., dim) + # convert to numpy - data = np.asarray(da.transpose(..., dim)) + data = np.asarray(da) start = np.asarray(start) end = np.asarray(end) window = np.asarray(window) @@ -53,11 +56,6 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): if not data.shape[:-1] == start.shape == end.shape: raise ValueError("shape mismatch between `da`, `start`, and `end`") - # make window even-sized (central value should be 1 so can be skipped) - if window.size % 2 != 0: - half_size = window.size // 2 - window = np.concatenate((window[:half_size], window[-half_size:])) - # select valid start/end mask = np.isfinite(start) & np.isfinite(end) selection = np.nonzero(mask)[0] @@ -75,6 +73,11 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): if min(stopindex - startindex) < window.size: raise ValueError("some selected windows are smaller than the window size") + # make window even-sized (central value should be 1 so can be skipped) + if window.size % 2 != 0: + half_size = window.size // 2 + window = np.concatenate((window[:half_size], window[-half_size:])) + # perform tapered selection data = _tapered_selection( data, @@ -99,8 +102,9 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): else: coords[name] = da[name][selection] + # return output DataArray - return xd.DataArray(data, coords=coords) + return 
xd.DataArray(data, coords=coords, dims=da.dims) @njit(parallel=True) From 6b15c8f2446bf28413d387375b753f4c9be46c87 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 2 Dec 2025 16:46:05 +0100 Subject: [PATCH 006/104] tapered_selection: add test_no_window, test_with_size and test_different_selection_lengths --- tests/test_picking.py | 109 +++++++++++++++++++++++++++++++++++++++++- xdas/picking.py | 7 ++- 2 files changed, 110 insertions(+), 6 deletions(-) diff --git a/tests/test_picking.py b/tests/test_picking.py index 81052cfd..7c86f02e 100644 --- a/tests/test_picking.py +++ b/tests/test_picking.py @@ -1,10 +1,9 @@ import numpy as np +import pytest import xdas as xd from xdas.picking import tapered_selection -import pytest - class TestTaperedSelection: def generate(self): @@ -84,6 +83,112 @@ def test_window_size_error(self): ): tapered_selection(da, start, end, window, dim="time") + def test_no_window(self): + da = self.generate() + + start = xd.DataArray( + data=np.array( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:03")] * 2 + + [np.datetime64("NaT")] * 2 + ), + coords={"distance": da["distance"]}, + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:07")] * 2 + + [np.datetime64("NaT")] * 2 + ) + + result = tapered_selection(da, start, end, dim="time") + + expected = xd.DataArray( + data=[ + [13.0, 14.0, 15.0, 16.0, 17.0], + [23.0, 24.0, 25.0, 26.0, 27.0], + ], + coords={ + "distance": [100.0, 200.0], + "time": { + "tie_indices": [0, 4], + "tie_values": [0.0, 4.0], + }, + }, + ) + + assert result.equals(expected) + + def test_with_size(self): + da = self.generate() + + start = xd.DataArray( + data=np.array( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:03")] * 2 + + [np.datetime64("NaT")] * 2 + ), + coords={"distance": da["distance"]}, + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:07")] * 2 + + [np.datetime64("NaT")] * 2 + ) + + result = tapered_selection(da, start, 
end, size=8, dim="time") + + expected = xd.DataArray( + data=[ + [13.0, 14.0, 15.0, 16.0, 17.0, 0.0, 0.0, 0.0], + [23.0, 24.0, 25.0, 26.0, 27.0, 0.0, 0.0, 0.0], + ], + coords={ + "distance": [100.0, 200.0], + "time": { + "tie_indices": [0, 7], + "tie_values": [0.0, 7.0], + }, + }, + ) + + assert result.equals(expected) + + def test_different_selection_lengths(self): + da = self.generate() + + start = xd.DataArray( + data=np.array( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:03")] + + [np.datetime64("2023-01-01T00:00:04")] + + [np.datetime64("NaT")] * 2 + ), + coords={"distance": da["distance"]}, + ) + end = ( + [np.datetime64("NaT")] + + [np.datetime64("2023-01-01T00:00:07")] * 2 + + [np.datetime64("NaT")] * 2 + ) + + result = tapered_selection(da, start, end, dim="time") + + expected = xd.DataArray( + data=[ + [13.0, 14.0, 15.0, 16.0, 17.0], + [24.0, 25.0, 26.0, 27.0, 0.0], + ], + coords={ + "distance": [100.0, 200.0], + "time": { + "tie_indices": [0, 4], + "tie_values": [0.0, 4.0], + }, + }, + ) + + assert result.equals(expected) + def test_other_dim(self): da = self.generate() diff --git a/xdas/picking.py b/xdas/picking.py index 1abfec8e..0996892d 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -5,7 +5,7 @@ import xdas as xd -def tapered_selection(da, start, end, window, size=None, dim="last"): +def tapered_selection(da, start, end, window=None, size=None, dim="last"): """ Selects and tapers a DataArray based on `start` and `end` values. @@ -31,7 +31,7 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): fast length of the maximum selected window. dim : str, optional Dimension along which to perform the selection and tapering. Default is'last'. - window : array-like + window : array-like, optional Tapering window to apply to the selected data. 
Returns @@ -50,7 +50,7 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): data = np.asarray(da) start = np.asarray(start) end = np.asarray(end) - window = np.asarray(window) + window = np.asarray(window if window is not None else []) # check shapes if not data.shape[:-1] == start.shape == end.shape: @@ -102,7 +102,6 @@ def tapered_selection(da, start, end, window, size=None, dim="last"): else: coords[name] = da[name][selection] - # return output DataArray return xd.DataArray(data, coords=coords, dims=da.dims) From f7cedb23864cd36005f8f3e5ecdbccd95494c805 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 2 Dec 2025 16:52:46 +0100 Subject: [PATCH 007/104] Add documentation for tapered_selection in picking.md --- docs/api/picking.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 docs/api/picking.md diff --git a/docs/api/picking.md b/docs/api/picking.md new file mode 100644 index 00000000..8216455a --- /dev/null +++ b/docs/api/picking.md @@ -0,0 +1,12 @@ +```{eval-rst} +.. currentmodule:: xdas.picking +``` + +# xdas.parallel + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + tapered_selection +``` \ No newline at end of file From d2f4ab8a34384984350b71997891032a146b22c2 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni <32540975+atrabattoni@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:03:52 +0100 Subject: [PATCH 008/104] Update xdas/picking.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- xdas/picking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdas/picking.py b/xdas/picking.py index 0996892d..78f1cf25 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -30,7 +30,7 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): Size of the output data along `dim`. If None, it is determined by the next fast length of the maximum selected window. dim : str, optional - Dimension along which to perform the selection and tapering. 
Default is'last'. + Dimension along which to perform the selection and tapering. Default is 'last'. window : array-like, optional Tapering window to apply to the selected data. From 2ef3674d137c6c593dc183b9dfd0f795e670965b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni <32540975+atrabattoni@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:04:11 +0100 Subject: [PATCH 009/104] Update xdas/picking.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- xdas/picking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xdas/picking.py b/xdas/picking.py index 78f1cf25..070fb853 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -22,10 +22,10 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): dimensions. start : array-like Start values along the other dimension than `dim` (must be 1D and have the - same size) NaN or NaT values indicate coordinates to ignored. + same size) NaN or NaT values indicate coordinates to be ignored. end : array-like End values along the other dimension than `dim` (must be 1D and have the - same size) NaN or NaT values indicate coordinates to ignored. + same size) NaN or NaT values indicate coordinates to be ignored. size : int, optional Size of the output data along `dim`. If None, it is determined by the next fast length of the maximum selected window. From 3b39d3c1cc0f4116b1ec83bb3736b097dd72aa54 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni <32540975+atrabattoni@users.noreply.github.com> Date: Tue, 2 Dec 2025 17:04:26 +0100 Subject: [PATCH 010/104] Update docs/api/picking.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- docs/api/picking.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/api/picking.md b/docs/api/picking.md index 8216455a..9aa9bc95 100644 --- a/docs/api/picking.md +++ b/docs/api/picking.md @@ -2,7 +2,7 @@ .. currentmodule:: xdas.picking ``` -# xdas.parallel +# xdas.picking ```{eval-rst} .. 
autosummary:: From 364ebe44bff6b4f6e415bdb10c446b3ff34b7e72 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 2 Dec 2025 17:09:30 +0100 Subject: [PATCH 011/104] Fix the no valid selection case. --- tests/test_picking.py | 10 ++++++++++ xdas/picking.py | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/tests/test_picking.py b/tests/test_picking.py index 7c86f02e..bec808a0 100644 --- a/tests/test_picking.py +++ b/tests/test_picking.py @@ -216,3 +216,13 @@ def test_other_dim(self): ) assert result.equals(expected) + + def test_no_valid_selections(self): + da = self.generate() + + start = [np.datetime64("NaT")] * 5 + end = [np.datetime64("NaT")] * 5 + window = [0.5, 1.0, 0.5] + + with pytest.raises(ValueError, match="No valid start/end pairs found"): + tapered_selection(da, start, end, window, dim="time") diff --git a/xdas/picking.py b/xdas/picking.py index 070fb853..76c329f6 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -60,6 +60,10 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): mask = np.isfinite(start) & np.isfinite(end) selection = np.nonzero(mask)[0] + # check selection + if selection.size == 0: + raise ValueError("No valid start/end pairs found") + # get selection indices startindex = da[dim].get_indexer(start[selection], method="bfill") endindex = da[dim].get_indexer(end[selection], method="ffill") From fb3ab4517815f8b894db26f3e9f27c9687787a19 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 16 Dec 2025 09:18:15 +0100 Subject: [PATCH 012/104] Fix `irfft` default output length `n` and improves `fft` docstrings. 
--- tests/test_fft.py | 17 +++++++ xdas/fft.py | 113 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 95 insertions(+), 35 deletions(-) diff --git a/tests/test_fft.py b/tests/test_fft.py index f76d3a13..8b403120 100644 --- a/tests/test_fft.py +++ b/tests/test_fft.py @@ -45,3 +45,20 @@ def test_real(self): assert np.allclose(result["time"].values, ref) else: assert result[name].equals(expected[name]) + + def test_real_default_n(self): + expected = xd.synthetics.wavelet_wavefronts() + expected = expected.isel(time=slice(0, expected.sizes["time"] // 2 * 2)) + result = xfft.irfft( + xfft.rfft(expected, dim={"time": "frequency"}), + dim={"frequency": "time"}, + ) + assert np.allclose(result.values, expected.values) + for name in result.coords: + if name == "time": + ref = expected["time"].values + ref = (ref - ref[0]) / np.timedelta64(1, "s") + ref += result["time"][0].values + assert np.allclose(result["time"].values, ref) + else: + assert result[name].equals(expected[name]) diff --git a/xdas/fft.py b/xdas/fft.py index d4364c94..5d6ca31d 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -19,10 +19,10 @@ def fft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): da: DataArray The data array to process, can be complex. n: int, optional - Length of transformed dimension of the output. If n is smaller than the length - of the input, the input is cropped. If it is larger, the input is padded with - zeros. If n is not given, the length of the input along the dimension specified - by `dim` is used. + Length of transformed dimension of the output. If `n` is smaller than the + length of the input, the input is cropped. If it is larger, the input is + padded with zeros. If `n` is not given, the length of the input along the + dimension specified by `dim` is used. dim: {str: str}, optional A mapping indicating as a key the dimension along which to compute the FFT, and as value the new name of the dimension. Default to {"last": "spectrum"}. 
@@ -38,9 +38,20 @@ def fft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional fourrier transform, repeat this function on the + To perform a multidimensional fourrier operations, repeat this function on the desired dimensions. + Examples + -------- + >>> import xdas as xd + >>> import xdas.fft as xfft + >>> signal = xd.DataArray([0., 1., 0., -1.], coords={"time": [0, 1, 2, 3]}) + >>> xfft.fft(signal, dim={"time": "frequency"}) + + [0.+0.j 0.+2.j 0.+0.j 0.-2.j] + Coordinates: + * frequency (frequency): [-0.5 ... 0.25] + """ ((olddim, newdim),) = dim.items() olddim = da.dims[da.get_axis_num(olddim)] @@ -75,10 +86,10 @@ def rfft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): da: DataArray The data array to process, can be complex. n: int, optional - Length of transformed dimension of the output. If n is smaller than the length - of the input, the input is cropped. If it is larger, the input is padded with - zeros. If n is not given, the length of the input along the dimension specified - by `dim` is used. + Length of transformed dimension of the output. If `n` is smaller than the + length of the input, the input is cropped. If it is larger, the input is + padded with zeros. If `n` is not given, the length of the input along the + dimension specified by `dim` is used. dim: {str: str}, optional A mapping indicating as a key the dimension along which to compute the FFT, and as value the new name of the dimension. Default to {"last": "spectrum"}. @@ -95,9 +106,20 @@ def rfft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional fourrier transform, repeat this function on the + To perform a multidimensional fourrier operations, repeat this function on the desired dimensions. 
+ Examples + -------- + >>> import xdas as xd + >>> import xdas.fft as xfft + >>> signal = xd.DataArray([0., 1., 0., -1.], coords={"time": [0, 1, 2, 3]}) + >>> xfft.rfft(signal, dim={"time": "frequency"}) + + [0.+0.j 0.-2.j 0.+0.j] + Coordinates: + * frequency (frequency): [0. ... 0.5] + """ ((olddim, newdim),) = dim.items() olddim = da.dims[da.get_axis_num(olddim)] @@ -119,23 +141,19 @@ def rfft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): @atomized -def ifft(da, n=None, dim={"last": "time"}, norm=None, parallel=None): +def ifft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): """ - Compute the inverse discrete Fourier Transform along a given dimension. - - This function computes the inverse of the one-dimensional n-point discrete Fourier - transform computed by fft. In other words, ifft(fft(a)) == a to within numerical - accuracy. + Compute the inverse of `fft`. Parameters ---------- da: DataArray The data array to process, should be complex. n: int, optional - Length of transformed dimension of the output. If n is smaller than the length - of the input, the input is cropped. If it is larger, the input is padded with - zeros. If n is not given, the length of the input along the dimension specified - by `dim` is used. + Length of transformed dimension of the output. If `n` is smaller than the + length of the input, the input is cropped. If it is larger, the input is + padded with zeros. If `n` is not given, the length of the input along the + dimension specified by `dim` is used. dim: {str: str}, optional A mapping indicating as a key the dimension along which to compute the IFFT, and as value the new name of the dimension. Default to {"last": "time"}. 
@@ -151,9 +169,19 @@ def ifft(da, n=None, dim={"last": "time"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional inverse fourrier transform, repeat this function on + To perform a multidimensional inverse fourrier operations, repeat this function on the desired dimensions. + Examples + -------- + >>> import xdas as xd + >>> import xdas.fft as xfft + >>> signal = xd.DataArray([0., 1., 0., -1.], coords={"time": [0, 1, 2, 3]}) + >>> spectrum = xfft.fft(signal, dim={"time": "frequency"}) + >>> result = xfft.ifft(spectrum, dim={"frequency": "time"}) + >>> result["time"] = signal["time"] # to match time coordinates + >>> assert np.real(result).equals(signal) + """ ((olddim, newdim),) = dim.items() olddim = da.dims[da.get_axis_num(olddim)] @@ -176,22 +204,20 @@ def ifft(da, n=None, dim={"last": "time"}, norm=None, parallel=None): @atomized -def irfft(da, n=None, dim={"last": "time"}, norm=None, parallel=None): +def irfft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): """ - Compute the discrete Fourier Transform for real inputs along a given dimension. - - This function computes the one-dimensional n-point discrete Fourier Transform (DFT) - or real-valued inputs with the efficient Fast Fourier Transform (FFT) algorithm. + Computes the inverse of `rfft`. Parameters ---------- da: DataArray The data array to process, can be complex. - n: int, optional - Length of transformed dimension of the output. If n is smaller than the length - of the input, the input is cropped. If it is larger, the input is padded with - zeros. If n is not given, the length of the input along the dimension specified - by `dim` is used. + n : int, optional + Length of the transformed dimension of the output. For `n` output points, + ``n//2+1`` input points are necessary. If the input is longer than this, it is + cropped. If it is shorter than this, it is padded with zeros. 
If `n` is not + given, it is taken to be ``2*(m-1)`` where ``m`` is the length of the input + along the dimension specified by `dim`. dim: {str: str}, optional A mapping indicating as a key the dimension along which to compute the FFT, and as value the new name of the dimension. Default to {"last": "time"}. @@ -203,19 +229,36 @@ def irfft(da, n=None, dim={"last": "time"}, norm=None, parallel=None): Returns ------- DataArray: - The transformed input with an updated dimension name and values. The length of - the transformed dimension is (n/2)+1 if n is even or (n+1)/2 if n is odd. + The truncated or zero-padded input, transformed along the dimension indicated + by `dim`, or the last one if `dim` is not specified. The length of the + transformed dimension is `n`, or, if `n` is not given, ``2*(m-1)`` where ``m`` + is the length of the transformed dimension of the input. To get an odd number + of output points, `n` must be specified. Notes ----- - To perform a multidimensional fourrier transform, repeat this function on the + To perform a multidimensional fourrier operations, repeat this function on the desired dimensions. + Examples + -------- + >>> import xdas as xd + >>> import xdas.fft as xfft + >>> signal = xd.DataArray([0., 1., 0., -1.], coords={"time": [0, 1, 2, 3]}) + >>> spectrum = xfft.rfft(signal, dim={"time": "frequency"}) + >>> result = xfft.irfft( + ... spectrum, + ... n=signal.sizes["time"], # ensure correct output if n is odd + ... dim={"frequency": "time"}, + ... 
) + >>> result["time"] = signal["time"] # to match time coordinates + >>> assert np.real(result).equals(signal) + """ ((olddim, newdim),) = dim.items() olddim = da.dims[da.get_axis_num(olddim)] if n is None: - n = da.sizes[olddim] + n = (da.sizes[olddim] - 1) * 2 axis = da.get_axis_num(olddim) d = get_sampling_interval(da, olddim) across = int(axis == 0) From 3ffb6be6eaeebe45846d4457abb9f38f95b3a86d Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 16 Dec 2025 19:48:21 +0100 Subject: [PATCH 013/104] Add Coordinate.isdim and .name. Fix Coordinate.to_dataarray(). --- tests/test_coordinates.py | 55 +++++++++++++++++++++++++++++++++++++++ tests/test_dataarray.py | 5 +++- xdas/core/coordinates.py | 46 +++++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 4 deletions(-) diff --git a/tests/test_coordinates.py b/tests/test_coordinates.py index 85f5c85a..0b295fa7 100644 --- a/tests/test_coordinates.py +++ b/tests/test_coordinates.py @@ -557,6 +557,61 @@ def test_empty(self): with pytest.raises(TypeError, match="cannot infer coordinate type"): xdas.Coordinate() + def test_isdim(self): + coord = xdas.Coordinate([1, 2, 3]) + assert coord.isdim() is None + coord = xdas.Coordinate([1, 2, 3], "dim") + assert coord.isdim() is None + coords = xdas.Coordinates({"dim": coord}) + assert coords["dim"].isdim() + coords = xdas.Coordinates({"other_dim": coord}) + assert not coords["other_dim"].isdim() + + def test_name(self): + coord = xdas.Coordinate([1, 2, 3]) + assert coord.name is None + coord = xdas.Coordinate([1, 2, 3], "dim") + assert coord.name == "dim" + coords = xdas.Coordinates({"dim": coord}) + assert coords["dim"].name == "dim" + coords = xdas.Coordinates({"other_dim": coord}) + assert coords["other_dim"].name == "other_dim" + + def test_to_dataarray(self): + coord = xdas.Coordinate([1, 2, 3]) + with pytest.raises(ValueError, match="unnamed coordinate"): + coord.to_dataarray() + coord = xdas.Coordinate([1, 2, 3], "dim") + result = 
coord.to_dataarray() + expected = xdas.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") + assert result.equals(expected) + coords = xdas.Coordinates({"dim": coord}) + result = coords["dim"].to_dataarray() + assert result.equals(expected) + coords = xdas.Coordinates({"other_dim": coord}) + result = coords["other_dim"].to_dataarray() + expected = xdas.DataArray( + [1, 2, 3], coords={"other_dim": coord}, dims=["dim"], name="other_dim" + ) + assert result.equals(expected) + coords["dim"] = [4, 5, 6] + result = coords["dim"].to_dataarray() + expected = xdas.DataArray( + [4, 5, 6], + coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, + dims=["dim"], + name="dim", + ) + assert result.equals(expected) + result = coords["other_dim"].to_dataarray() + expected = xdas.DataArray( + [1, 2, 3], + coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, + dims=["dim"], + name="other_dim", + ) + assert result.equals(expected) + class TestCoordinates: def test_init(self): diff --git a/tests/test_dataarray.py b/tests/test_dataarray.py index db49e4e8..a0a9c0bf 100644 --- a/tests/test_dataarray.py +++ b/tests/test_dataarray.py @@ -67,7 +67,10 @@ def test_init_and_properties(self): def test_raises_on_data_and_coords_mismatch(self): with pytest.raises(ValueError, match="different number of dimensions"): DataArray(np.zeros(3), dims=("time", "distance")) - with pytest.raises(ValueError, match="infered dimension number from `coords`"): + with pytest.raises( + ValueError, + match="inferred number of dimensions 2 from `coords` does not match `data` dimensionality of 1", + ): DataArray(np.zeros(3), coords={"time": [1], "distance": [1]}) with pytest.raises(ValueError, match="conflicting sizes for dimension"): DataArray(np.zeros((2, 3)), coords={"time": [1, 2], "distance": [1, 2]}) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index f87793f2..8f251821 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -98,6 +98,7 @@ def __setitem__(self, 
key, value): f"conflicting sizes for dimension {coord.dim}: size {len(coord)} " f"in `coords` and size {size} in `data`" ) + coord._parent = self return super().__setitem__(key, coord) def __repr__(self): @@ -232,8 +233,8 @@ def drop_coords(self, *names): def _assign_parent(self, parent): if not len(self.dims) == parent.ndim: raise ValueError( - "infered dimension number from `coords` does not match " - "`data` dimensionality`" + f"inferred number of dimensions {len(self.dims)} from `coords` does " + f"not match `data` dimensionality of {parent.ndim}" ) for dim, size in zip(self.dims, parent.shape): if (dim in self) and (not len(self[dim]) == size): @@ -336,7 +337,27 @@ def append(self, other): def to_dataarray(self): from .dataarray import DataArray # TODO: avoid defered import? - return DataArray(self.values, {self.dim: self}, name=self.dim) + if self.name is None: + raise ValueError("cannot convert unnamed coordinate to DataArray") + + if self.parent is None: + return DataArray( + self.values, + {self.dim: self}, + dims=[self.dim], + name=self.name, + ) + else: + return DataArray( + self.values, + { + name: coord + for name, coord in self.parent.items() + if coord.dim == self.dim + }, + dims=[self.dim], + name=self.name, + ) def to_dict(self): raise NotImplementedError @@ -345,6 +366,25 @@ def to_dict(self): def from_dict(cls, dct): return cls(**dct) + def __reduce__(self): + return self.__class__, (self.data, self.dim), {"_parent": self.parent} + + @property + def parent(self): + return getattr(self, "_parent", None) + + @property + def name(self): + if self.parent is None: + return self.dim + return next((name for name in self.parent if self.parent[name] is self), None) + + def isdim(self): + if self.parent is None or self.name is None: + return None + else: + return self.parent.isdim(self.name) + class ScalarCoordinate(Coordinate): def __new__(cls, *args, **kwargs): From a70ad5acf0f3fef61ba8a7cb4fbc32a5da6e7d6a Mon Sep 17 00:00:00 2001 From: Alister 
Trabattoni Date: Wed, 17 Dec 2025 08:33:23 +0100 Subject: [PATCH 014/104] rename tie_samples -> tie_lengths --- xdas/core/coordinates.py | 70 ++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 1f712024..66edd5c3 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -980,43 +980,43 @@ def __new__(cls, *args, **kwargs): def __init__(self, data=None, dim=None, dtype=None): if data is None: - data = {"tie_values": [], "tie_samples": [], "sampling_interval": None} + data = {"tie_values": [], "tie_lengths": [], "sampling_interval": None} data, dim = parse(data, dim) if not self.__class__.isvalid(data): raise TypeError("`data` must be dict-like") - if not set(data) == {"tie_values", "tie_samples", "sampling_interval"}: + if not set(data) == {"tie_values", "tie_lengths", "sampling_interval"}: raise ValueError( - "keys `tie_values`, `tie_samples`, and `sampling_interval` must be provided" + "keys `tie_values`, `tie_lengths`, and `sampling_interval` must be provided" ) tie_values = np.asarray(data["tie_values"], dtype=dtype) - tie_samples = np.asarray(data["tie_samples"]) + tie_lengths = np.asarray(data["tie_lengths"]) sampling_interval = np.asarray(data["sampling_interval"]) if not tie_values.ndim == 1: raise ValueError("`tie_values` must be 1D") - if not tie_samples.ndim == 1: - raise ValueError("`tie_samples` must be 1D") - if not len(tie_values) == len(tie_samples): - raise ValueError("`tie_values` and `tie_samples` must have the same length") + if not tie_lengths.ndim == 1: + raise ValueError("`tie_lengths` must be 1D") + if not len(tie_values) == len(tie_lengths): + raise ValueError("`tie_values` and `tie_lengths` must have the same length") if not ( np.issubdtype(tie_values.dtype, np.number) or np.issubdtype(tie_values.dtype, np.datetime64) ): raise ValueError("`tie_values` must have either numeric or datetime dtype") if not self.empty: - if 
not np.issubdtype(tie_samples.dtype, np.integer): - raise ValueError("`tie_samples` must be integer-like") - if not np.all(tie_samples > 0): - raise ValueError("`tie_samples` must be positive integers") + if not np.issubdtype(tie_lengths.dtype, np.integer): + raise ValueError("`tie_lengths` must be integer-like") + if not np.all(tie_lengths > 0): + raise ValueError("`tie_lengths` must be positive integers") if not np.isscalar(sampling_interval): raise ValueError("`sampling_interval` must be a scalar value") if np.issubdtype(sampling_interval.dtype, np.datetime64): sampling_interval = sampling_interval.astype("timedelta64[ns]") else: sampling_interval = np.asarray(sampling_interval).astype(dtype) - tie_samples = tie_samples.astype(int) + tie_lengths = tie_lengths.astype(int) self.data = dict( tie_values=tie_values, - tie_samples=tie_samples, + tie_lengths=tie_lengths, sampling_interval=sampling_interval, ) self.dim = dim @@ -1026,7 +1026,7 @@ def isvalid(data): match data: case { "tie_values": _, - "tie_samples": _, + "tie_lengths": _, "sampling_interval": _, }: return True @@ -1037,7 +1037,7 @@ def __len__(self): if self.empty: return 0 else: - return sum(self.tie_samples) + return sum(self.tie_lengths) def __repr__(self): if self.empty: @@ -1066,7 +1066,7 @@ def __add__(self, other): return self.__class__( { "tie_values": self.tie_values + other, - "tie_samples": self.tie_samples, + "tie_lengths": self.tie_lengths, "sampling_interval": self.sampling_interval, }, self.dim, @@ -1076,7 +1076,7 @@ def __sub__(self, other): return self.__class__( { "tie_values": self.tie_values - other, - "tie_samples": self.tie_samples, + "tie_lengths": self.tie_lengths, "sampling_interval": self.sampling_interval, }, self.dim, @@ -1099,8 +1099,8 @@ def tie_values(self): return self.data["tie_values"] @property - def tie_samples(self): - return self.data["tie_samples"] + def tie_lengths(self): + return self.data["tie_lengths"] @property def sampling_interval(self): @@ -1108,7 +1108,7 
@@ def sampling_interval(self): @property def tie_indices(self): - return np.concatenate(([0], np.cumsum(self.tie_samples[:-1]))) + return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) @property def empty(self): @@ -1146,12 +1146,12 @@ def start(self): @property def end(self): - return self.tie_values[-1] + self.sampling_interval * self.tie_samples[-1] + return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] def equals(self, other): return ( np.array_equal(self.tie_values, other.tie_values) - and np.array_equal(self.tie_samples, other.tie_samples) + and np.array_equal(self.tie_lengths, other.tie_lengths) and self.sampling_interval == other.sampling_interval and self.dim == other.dim and self.dtype == other.dtype @@ -1177,18 +1177,18 @@ def slice_index(self, index_slice): return self.__class__( dict( tie_values=[], - tie_samples=[], + tie_lengths=[], sampling_interval=self.sampling_interval, ), self.dim, ) elif (stop_index - start_index) <= step_index: tie_values = [self.get_value(start_index)] - tie_samples = [stop_index - start_index] + tie_lengths = [stop_index - start_index] return self.__class__( dict( tie_values=tie_values, - tie_samples=tie_samples, + tie_lengths=tie_lengths, sampling_interval=self.sampling_interval, ), self.dim, @@ -1197,7 +1197,7 @@ def slice_index(self, index_slice): # keep tie values, number of samples and related tie indices contained in the slice mask = (start_index < self.tie_indices) & (self.tie_indices <= stop_index) tie_values = self.tie_values[mask] - tie_samples = self.tie_samples[mask] + tie_lengths = self.tie_lengths[mask] tie_indices = self.tie_indices[mask] # insert the missing start value @@ -1205,15 +1205,15 @@ def slice_index(self, index_slice): tie_values = np.concatenate([[start_value], self.tie_values[mask]]) # insert the missing start number of samples and adjust the end one - tie_samples = np.concatenate( - [[start_index - tie_indices[0]], tie_samples[mask]] + tie_lengths = np.concatenate( 
+ [[start_index - tie_indices[0]], tie_lengths[mask]] ) - tie_samples[-1] = stop_index - tie_indices[-1] + tie_lengths[-1] = stop_index - tie_indices[-1] # repack data and decimate if needed data = { "tie_values": tie_values, - "tie_samples": tie_samples, + "tie_lengths": tie_lengths, "sampling_interval": self.sampling_interval, } coord = self.__class__(data, self.dim) @@ -1259,11 +1259,11 @@ def append(self, other): "cannot append coordinate with different sampling intervals" ) tie_values = np.concatenate([self.tie_values, other.tie_values]) - tie_samples = np.concatenate([self.tie_samples, other.tie_samples + len(self)]) + tie_lengths = np.concatenate([self.tie_lengths, other.tie_lengths + len(self)]) return self.__class__( { "tie_values": tie_values, - "tie_samples": tie_samples, + "tie_lengths": tie_lengths, "sampling_interval": self.sampling_interval, }, self.dim, @@ -1293,12 +1293,12 @@ def from_array(cls, arr, dim=None, sampling_interval=None): def to_dict(self): tie_values = self.data["tie_values"] - tie_samples = self.data["tie_samples"] + tie_lengths = self.data["tie_lengths"] if np.issubdtype(tie_values.dtype, np.datetime64): tie_values = tie_values.astype(str) data = { "tie_values": tie_values.tolist(), - "tie_samples": tie_samples.tolist(), + "tie_lengths": tie_lengths.tolist(), "sampling_interval": self.sampling_interval, } return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} From e0147e8ac0d696415a5cd6fee2e3544ec7a5950e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 09:28:02 +0100 Subject: [PATCH 015/104] Improve SampleCoordinate __ini__ parsing. 
--- xdas/core/coordinates.py | 43 +++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index e3457296..2a89406b 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1034,41 +1034,52 @@ def __new__(cls, *args, **kwargs): return object.__new__(cls) def __init__(self, data=None, dim=None, dtype=None): + # empty if data is None: data = {"tie_values": [], "tie_lengths": [], "sampling_interval": None} + empty = True + else: + empty = False + + # parse data data, dim = parse(data, dim) if not self.__class__.isvalid(data): - raise TypeError("`data` must be dict-like") - if not set(data) == {"tie_values", "tie_lengths", "sampling_interval"}: - raise ValueError( - "keys `tie_values`, `tie_lengths`, and `sampling_interval` must be provided" + raise TypeError( + "`data` must be dict-like and contain `tie_values`, `tie_lengths`, and " + "`sampling_interval`" ) tie_values = np.asarray(data["tie_values"], dtype=dtype) tie_lengths = np.asarray(data["tie_lengths"]) sampling_interval = np.asarray(data["sampling_interval"]) + + # check shapes if not tie_values.ndim == 1: raise ValueError("`tie_values` must be 1D") if not tie_lengths.ndim == 1: raise ValueError("`tie_lengths` must be 1D") if not len(tie_values) == len(tie_lengths): raise ValueError("`tie_values` and `tie_lengths` must have the same length") - if not ( - np.issubdtype(tie_values.dtype, np.number) - or np.issubdtype(tie_values.dtype, np.datetime64) - ): - raise ValueError("`tie_values` must have either numeric or datetime dtype") - if not self.empty: + + # check dtypes + if not empty: + if not ( + np.issubdtype(tie_values.dtype, np.number) + or np.issubdtype(tie_values.dtype, np.datetime64) + ): + raise ValueError( + "`tie_values` must have either numeric or datetime dtype" + ) if not np.issubdtype(tie_lengths.dtype, np.integer): raise ValueError("`tie_lengths` must be integer-like") if not 
np.all(tie_lengths > 0): - raise ValueError("`tie_lengths` must be positive integers") + raise ValueError("`tie_lengths` must be strictly positive integers") if not np.isscalar(sampling_interval): raise ValueError("`sampling_interval` must be a scalar value") - if np.issubdtype(sampling_interval.dtype, np.datetime64): - sampling_interval = sampling_interval.astype("timedelta64[ns]") - else: - sampling_interval = np.asarray(sampling_interval).astype(dtype) - tie_lengths = tie_lengths.astype(int) + if np.issubdtype(tie_values.dtype, np.datetime64): + if not np.issubdtype(sampling_interval.dtype, np.timedelta64): + raise ValueError( + "`sampling_interval` must be timedelta64 for datetime64 `tie_values`" + ) self.data = dict( tie_values=tie_values, tie_lengths=tie_lengths, From 8e8ff55737db2a1dc3757ef5b747ebff90ea2f4f Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 09:28:41 +0100 Subject: [PATCH 016/104] add get_sampling_interval suppor SampleCoordinate. --- xdas/core/coordinates.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 2a89406b..652f0a55 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1413,6 +1413,8 @@ def get_sampling_interval(da, dim, cast=True): num = num[mask] den = den[mask] d = np.median(num / den) + elif isinstance(coord, SampledCoordinate): + d = coord.sampling_interval else: d = (coord[-1].values - coord[0].values) / (len(coord) - 1) d = np.asarray(d) From edead467726675a3e589914cb1abcdbb45b63b8c Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 10:05:48 +0100 Subject: [PATCH 017/104] Refactor Coordinate to_netcdf. 
--- xdas/core/coordinates.py | 31 ++++++++++++++++++++++ xdas/core/dataarray.py | 55 ++++++++++++++-------------------------- 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 652f0a55..57f35d9f 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -362,6 +362,12 @@ def to_dataarray(self): def to_dict(self): raise NotImplementedError + def to_netcdf(self, ds, attrs): + ds = ds.assign_coords( + {self.name: (self.dim, self.values) if self.dim else self.values} + ) + return ds, attrs + @classmethod def from_dict(cls, dct): return cls(**dct) @@ -1015,6 +1021,31 @@ def to_dict(self): } return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + def to_netcdf(self, ds, attrs): + mapping = f"{self.name}: {self.name}_indices {self.name}_values" + if "coordinate_interpolation" in attrs: + attrs["coordinate_interpolation"] += " " + mapping + else: + attrs["coordinate_interpolation"] = mapping + tie_indices = self.tie_indices + tie_values = ( + self.tie_values.astype("M8[ns]") + if np.issubdtype(self.tie_values.dtype, np.datetime64) + else self.tie_values + ) + interp_attrs = { + "interpolation_name": "linear", + "tie_points_mapping": f"{self.name}_points: {self.name}_indices {self.name}_values", + } + ds.update( + { + f"{self.name}_interpolation": ((), np.nan, interp_attrs), + f"{self.name}_indices": (f"{self.name}_points", tie_indices), + f"{self.name}_values": (f"{self.name}_points", tie_values), + } + ) + return ds, attrs + class SampledCoordinate(Coordinate): """ diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 2dd762a6..b63e7591 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -874,36 +874,17 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): """ if virtual is None: virtual = isinstance(self.data, (VirtualArray, DaskArray)) + + # initialize ds = xr.Dataset(attrs={"Conventions": "CF-1.9"}) - mappings = [] - 
for name, coord in self.coords.items(): - if coord.isinterp(): - mappings.append(f"{name}: {name}_indices {name}_values") - tie_indices = coord.tie_indices - tie_values = ( - coord.tie_values.astype("M8[ns]") - if np.issubdtype(coord.tie_values.dtype, np.datetime64) - else coord.tie_values - ) - attrs = { - "interpolation_name": "linear", - "tie_points_mapping": f"{name}_points: {name}_indices {name}_values", - } - ds.update( - { - f"{name}_interpolation": ((), np.nan, attrs), - f"{name}_indices": (f"{name}_points", tie_indices), - f"{name}_values": (f"{name}_points", tie_values), - } - ) - else: - ds = ds.assign_coords( - {name: (coord.dim, coord.values) if coord.dim else coord.values} - ) - mapping = " ".join(mappings) - attrs = {} if self.attrs is None else self.attrs - attrs |= {"coordinate_interpolation": mapping} if mapping else attrs - name = "__values__" if self.name is None else self.name + variable_attrs = {} if self.attrs is None else self.attrs + variable_name = "__values__" if self.name is None else self.name + + # prepare metadata + for coord in self.coords.values(): + ds, variable_attrs = coord.to_netcdf(ds, variable_attrs) + + # write data with h5netcdf.File(fname, mode=mode) as file: if group is not None and group not in file: file.create_group(group) @@ -912,7 +893,7 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): if not virtual: encoding = {} if encoding is None else encoding variable = file.create_variable( - name, + variable_name, self.dims, self.dtype, data=self.values, @@ -922,15 +903,15 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): if encoding is not None: raise ValueError("cannot use `encoding` with in virtual mode") if isinstance(self.data, VirtualArray): - self.data.to_dataset(file._h5group, name) - variable = file._variable_cls(file, name, self.dims) - file._variables[name] = variable + self.data.to_dataset(file._h5group, variable_name) + variable = 
file._variable_cls(file, variable_name, self.dims) + file._variables[variable_name] = variable variable._attach_dim_scales() variable._attach_coords() variable._ensure_dim_id() elif isinstance(self.data, DaskArray): variable = file.create_variable( - name, + variable_name, self.dims, self.dtype, ) @@ -941,8 +922,10 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): raise ValueError( "can only use `virtual=True` with a virtual array as data" ) - if attrs: - variable.attrs.update(attrs) + if variable_attrs: + variable.attrs.update(variable_attrs) + + # add metadata ds.to_netcdf(fname, mode="a", group=group, engine="h5netcdf") @classmethod From dbf3d2ebf3c239f68b51906d3e636e5ae37dbe8f Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 11:00:55 +0100 Subject: [PATCH 018/104] Refactor DataArray.to_netcdf: extract create_variable for virtual backend. --- xdas/core/dataarray.py | 29 ++++++++++++++--------------- xdas/dask/__init__.py | 2 +- xdas/dask/core.py | 7 +++++++ xdas/virtual.py | 9 +++++++++ 4 files changed, 31 insertions(+), 16 deletions(-) diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index b63e7591..d2857bd3 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -12,7 +12,7 @@ from dask.array import Array as DaskArray from numpy.lib.mixins import NDArrayOperatorsMixin -from ..dask.core import dumps, from_dict, loads, to_dict +from ..dask.core import create_variable, from_dict, loads, to_dict from ..virtual import VirtualArray, VirtualSource, _to_human from .coordinates import Coordinate, Coordinates, get_sampling_interval @@ -886,10 +886,15 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): # write data with h5netcdf.File(fname, mode=mode) as file: + # group if group is not None and group not in file: file.create_group(group) file = file if group is None else file[group] + + # dims file.dimensions.update(self.sizes) + + # variable if not virtual: 
encoding = {} if encoding is None else encoding variable = file.create_variable( @@ -903,29 +908,23 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): if encoding is not None: raise ValueError("cannot use `encoding` with in virtual mode") if isinstance(self.data, VirtualArray): - self.data.to_dataset(file._h5group, variable_name) - variable = file._variable_cls(file, variable_name, self.dims) - file._variables[variable_name] = variable - variable._attach_dim_scales() - variable._attach_coords() - variable._ensure_dim_id() - elif isinstance(self.data, DaskArray): - variable = file.create_variable( - variable_name, - self.dims, - self.dtype, + variable = self.data.create_variable( + file, variable_name, self.dims, self.dtype ) - variable.attrs.update( - {"__dask_array__": np.frombuffer(dumps(self.data), "uint8")} + elif isinstance(self.data, DaskArray): + variable = create_variable( + self.data, file, variable_name, self.dims, self.dtype ) else: raise ValueError( "can only use `virtual=True` with a virtual array as data" ) + + # attrs if variable_attrs: variable.attrs.update(variable_attrs) - # add metadata + # write metadata ds.to_netcdf(fname, mode="a", group=group, engine="h5netcdf") @classmethod diff --git a/xdas/dask/__init__.py b/xdas/dask/__init__.py index 60e4af0a..4f612c92 100644 --- a/xdas/dask/__init__.py +++ b/xdas/dask/__init__.py @@ -1 +1 @@ -from .core import dumps, loads +from .core import create_variable, dumps, loads diff --git a/xdas/dask/core.py b/xdas/dask/core.py index 23972237..911145ad 100644 --- a/xdas/dask/core.py +++ b/xdas/dask/core.py @@ -1,8 +1,15 @@ +import numpy as np from dask.array import Array from . 
import serial +def create_variable(arr, file, name, dims=None, dtype=None): + variable = file.create_variable(name, dims, dtype) + variable.attrs.update({"__dask_array__": np.frombuffer(dumps(arr), "uint8")}) + return variable + + def dumps(arr): """Serialize a dask array.""" return serial.dumps(to_dict(arr)) diff --git a/xdas/virtual.py b/xdas/virtual.py index e5ce9aa2..92aba6b4 100644 --- a/xdas/virtual.py +++ b/xdas/virtual.py @@ -49,6 +49,15 @@ def nbytes(self): else: return 0 + def create_variable(self, file, name, dims=None, dtype=None): + self.to_dataset(file._h5group, name) + variable = file._variable_cls(file, name, dims) + file._variables[name] = variable + variable._attach_dim_scales() + variable._attach_coords() + variable._ensure_dim_id() + return variable + class VirtualStack(VirtualArray): def __init__(self, sources=[], axis=0): From 32f7edfafa896756b24941067dd2bdf1ee04693b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 11:49:18 +0100 Subject: [PATCH 019/104] Refactor DataArray.from_netcdf. --- xdas/core/dataarray.py | 64 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index d2857bd3..4be0f545 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -944,55 +944,49 @@ def from_netcdf(cls, fname, group=None): DataArray The openend data array. """ + # read metadata with xr.open_dataset(fname, group=group, engine="h5netcdf") as ds: + # check file format if not ("Conventions" in ds.attrs and "CF" in ds.attrs["Conventions"]): raise TypeError( "file format not recognized. 
please provide the file format " "with the `engine` keyword argument" ) + + # identify the "main" data array if len(ds) == 1: name, da = next(iter(ds.items())) - coords = { - name: ( - ( - coord.dims[0], - ( - coord.values.astype("U") - if coord.dtype == np.dtype("O") - else coord.values - ), - ) - if coord.dims - else coord.values - ) - for name, coord in da.coords.items() - } else: - data_vars = [ - var - for var in ds.values() - if "coordinate_interpolation" in var.attrs - ] + data_vars = { + name: var + for name, var in ds.items() + if any("coordinate" in attr for attr in var.attrs) + } if len(data_vars) == 1: - da = data_vars[0] + name, da = next(iter(data_vars.items())) else: raise ValueError("several possible data arrays detected") - coords = { - name: ( + + # read regular coordinates + coords = { + name: ( + ( + coord.dims[0], ( - coord.dims[0], - ( - coord.values.astype("U") - if coord.dtype == np.dtype("O") - else coord.values - ), - ) - if coord.dims - else coord.values + coord.values.astype("U") + if coord.dtype == np.dtype("O") + else coord.values + ), ) - for name, coord in da.coords.items() - } - mapping = da.attrs.pop("coordinate_interpolation") + if coord.dims + else coord.values + ) + for name, coord in da.coords.items() + } + + # read advanced coordinates + mapping = da.attrs.pop("coordinate_interpolation", None) + if mapping is not None: matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) for match in matches: dim, indices, values = match From fb9ee294467fe2f3f88f1950aa2233329752e109 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 14:06:50 +0100 Subject: [PATCH 020/104] Refactor: use __subclasses__ when iterating on possible type of Coordinate. 
--- xdas/core/coordinates.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 57f35d9f..ec864fab 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -250,14 +250,10 @@ def __new__(cls, data=None, dim=None, dtype=None): if data is None: raise TypeError("cannot infer coordinate type if no `data` is provided") data, dim = parse(data, dim) - if ScalarCoordinate.isvalid(data): - return object.__new__(ScalarCoordinate) - elif DenseCoordinate.isvalid(data): - return object.__new__(DenseCoordinate) - elif InterpCoordinate.isvalid(data): - return object.__new__(InterpCoordinate) - else: - raise TypeError("could not parse `data`") + for subcls in cls.__subclasses__(): + if subcls.isvalid(data): + return object.__new__(subcls) + raise TypeError("could not parse `data`") def __getitem__(self, item): data = self.data.__getitem__(item) From a9048a7ee5f8cfb0b17eebae06f5342255caca84 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 14:31:44 +0100 Subject: [PATCH 021/104] Refactoring + Coordinate.from_dataset --- xdas/core/coordinates.py | 197 ++++++++++++++++++++++++++------------- xdas/core/dataarray.py | 12 +-- 2 files changed, 136 insertions(+), 73 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index ec864fab..858b88fc 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1,5 +1,6 @@ from copy import copy, deepcopy from functools import wraps +import re import numpy as np import pandas as pd @@ -212,6 +213,10 @@ def from_dict(cls, dct): dct["dims"], ) + @classmethod + def from_dataset(cls, ds, name): + return Coordinate.from_dataset(ds, name) + def copy(self, deep=True): if deep: func = deepcopy @@ -268,6 +273,9 @@ def __len__(self): def __repr__(self): return np.array2string(self.data, threshold=0, edgeitems=1) + def __reduce__(self): + return self.__class__, (self.data, self.dim), {"_parent": 
self.parent} + def __add__(self, other): return self.__class__(self.data + other, self.dim) @@ -310,6 +318,22 @@ def values(self): def empty(self): return len(self) == 0 + @property + def parent(self): + return getattr(self, "_parent", None) + + @property + def name(self): + if self.parent is None: + return self.dim + return next((name for name in self.parent if self.parent[name] is self), None) + + def isdim(self): + if self.parent is None or self.name is None: + return None + else: + return self.parent.isdim(self.name) + def equals(self, other): ... def to_index(self, item, method=None, endpoint=True): @@ -318,56 +342,6 @@ def to_index(self, item, method=None, endpoint=True): else: return self.get_indexer(item, method) - def isscalar(self): - return isinstance(self, ScalarCoordinate) - - def isdense(self): - return isinstance(self, DenseCoordinate) - - def isinterp(self): - return isinstance(self, InterpCoordinate) - - def append(self, other): - raise NotImplementedError(f"append is not implemented for {self.__class__}") - - def to_dataarray(self): - from .dataarray import DataArray # TODO: avoid defered import? 
- - if self.name is None: - raise ValueError("cannot convert unnamed coordinate to DataArray") - - if self.parent is None: - return DataArray( - self.values, - {self.dim: self}, - dims=[self.dim], - name=self.name, - ) - else: - return DataArray( - self.values, - { - name: coord - for name, coord in self.parent.items() - if coord.dim == self.dim - }, - dims=[self.dim], - name=self.name, - ) - - def to_dict(self): - raise NotImplementedError - - def to_netcdf(self, ds, attrs): - ds = ds.assign_coords( - {self.name: (self.dim, self.values) if self.dim else self.values} - ) - return ds, attrs - - @classmethod - def from_dict(cls, dct): - return cls(**dct) - def format_index(self, idx, bounds="raise"): idx = np.asarray(idx) if not np.issubdtype(idx.dtype, np.integer): @@ -420,24 +394,63 @@ def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): stop_index -= 1 return slice(start_index, stop_index) - def __reduce__(self): - return self.__class__, (self.data, self.dim), {"_parent": self.parent} + def isscalar(self): + return isinstance(self, ScalarCoordinate) - @property - def parent(self): - return getattr(self, "_parent", None) + def isdense(self): + return isinstance(self, DenseCoordinate) - @property - def name(self): - if self.parent is None: - return self.dim - return next((name for name in self.parent if self.parent[name] is self), None) + def isinterp(self): + return isinstance(self, InterpCoordinate) - def isdim(self): - if self.parent is None or self.name is None: - return None + def append(self, other): + raise NotImplementedError(f"append is not implemented for {self.__class__}") + + def to_dataarray(self): + from .dataarray import DataArray # TODO: avoid defered import? 
+ + if self.name is None: + raise ValueError("cannot convert unnamed coordinate to DataArray") + + if self.parent is None: + return DataArray( + self.values, + {self.dim: self}, + dims=[self.dim], + name=self.name, + ) else: - return self.parent.isdim(self.name) + return DataArray( + self.values, + { + name: coord + for name, coord in self.parent.items() + if coord.dim == self.dim + }, + dims=[self.dim], + name=self.name, + ) + + def to_dict(self): + raise NotImplementedError + + @classmethod + def from_dict(cls, dct): + return cls(**dct) + + def to_dataset(self, ds, attrs): + ds = ds.assign_coords( + {self.name: (self.dim, self.values) if self.dim else self.values} + ) + return ds, attrs + + @classmethod + def from_dataset(cls, ds, name): + coords = {} + for subcls in cls.__subclasses__(): + if hasattr(subcls, "from_dataset"): + coords |= subcls.from_dataset(ds, name) + return coords class ScalarCoordinate(Coordinate): @@ -1017,7 +1030,7 @@ def to_dict(self): } return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} - def to_netcdf(self, ds, attrs): + def to_dataset(self, ds, attrs): mapping = f"{self.name}: {self.name}_indices {self.name}_values" if "coordinate_interpolation" in attrs: attrs["coordinate_interpolation"] += " " + mapping @@ -1042,6 +1055,18 @@ def to_netcdf(self, ds, attrs): ) return ds, attrs + @classmethod + def from_dataset(cls, ds, name): + coords = {} + mapping = ds[name].attrs.pop("coordinate_interpolation", None) + if mapping is not None: + matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) + for match in matches: + dim, indices, values = match + data = {"tie_indices": ds[indices], "tie_values": ds[values]} + coords[dim] = Coordinate(data, dim) + return coords + class SampledCoordinate(Coordinate): """ @@ -1396,6 +1421,48 @@ def to_dict(self): } return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + def to_dataset(self, ds, attrs): + mapping = f"{self.name}: {self.name}_values {self.name}_lengths" + if 
"coordinate_sampling" in attrs: + attrs["coordinate_sampling"] += " " + mapping + else: + attrs["coordinate_sampling"] = mapping + tie_values = ( + self.tie_values.astype("M8[ns]") + if np.issubdtype(self.tie_values.dtype, np.datetime64) + else self.tie_values + ) + tie_lengths = self.tie_lengths + interp_attrs = { + "sampling_interval": self.sampling_interval, + "tie_points_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", + } + ds.update( + { + f"{self.name}_sampling": ((), np.nan, interp_attrs), + f"{self.name}_values": (f"{self.name}_points", tie_values), + f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), + } + ) + return ds, attrs + + @classmethod + def from_dataset(cls, dataset, name): + coords = {} + mapping = dataset[name].attrs.pop("coordinate_sampling", None) + if mapping is not None: + matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) + for match in matches: + dim, values, lengths = match + sampling_interval = ... + data = { + "tie_values": dataset[values], + "tie_lengths": dataset[lengths], + "sampling_interval": sampling_interval, + } + coords[dim] = Coordinate(data, dim) + return coords + def parse(data, dim=None): if isinstance(data, tuple): diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 4be0f545..11e4bbb4 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -882,7 +882,7 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): # prepare metadata for coord in self.coords.values(): - ds, variable_attrs = coord.to_netcdf(ds, variable_attrs) + ds, variable_attrs = coord.to_dataset(ds, variable_attrs) # write data with h5netcdf.File(fname, mode=mode) as file: @@ -985,13 +985,9 @@ def from_netcdf(cls, fname, group=None): } # read advanced coordinates - mapping = da.attrs.pop("coordinate_interpolation", None) - if mapping is not None: - matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) - for match in matches: - dim, indices, values = match - data = 
{"tie_indices": ds[indices], "tie_values": ds[values]} - coords[dim] = Coordinate(data, dim) + coords |= Coordinates.from_dataset(ds, name) + + # read data with h5py.File(fname) as file: if group: file = file[group] From c908381c4c4c871caa0f88c9555a8f96282dad67 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 14:34:56 +0100 Subject: [PATCH 022/104] Refactor: treat regular and advance coordinates with the same API. --- xdas/core/coordinates.py | 18 ++++++++++++++++++ xdas/core/dataarray.py | 21 ++------------------- 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 858b88fc..8277896b 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -645,6 +645,24 @@ def to_dict(self): data = self.data.tolist() return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + @classmethod + def from_dataset(cls, ds, name): + return { + name: ( + ( + coord.dims[0], + ( + coord.values.astype("U") + if coord.dtype == np.dtype("O") + else coord.values + ), + ) + if coord.dims + else coord.values + ) + for name, coord in ds[name].coords.items() + } + class InterpCoordinate(Coordinate): """ diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 11e4bbb4..c5ca93ca 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -967,25 +967,8 @@ def from_netcdf(cls, fname, group=None): else: raise ValueError("several possible data arrays detected") - # read regular coordinates - coords = { - name: ( - ( - coord.dims[0], - ( - coord.values.astype("U") - if coord.dtype == np.dtype("O") - else coord.values - ), - ) - if coord.dims - else coord.values - ) - for name, coord in da.coords.items() - } - - # read advanced coordinates - coords |= Coordinates.from_dataset(ds, name) + # read coordinates + coords = Coordinates.from_dataset(ds, name) # read data with h5py.File(fname) as file: From c2104a737dfa6cc87a5e9b735649286f8bf719d5 Mon Sep 17 00:00:00 2001 From: 
Alister Trabattoni Date: Wed, 17 Dec 2025 15:03:50 +0100 Subject: [PATCH 023/104] Refactor: in DataArray.from_netcdf, cleaner data retrieval. --- xdas/core/dataarray.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index c5ca93ca..f9a7cf45 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -955,15 +955,15 @@ def from_netcdf(cls, fname, group=None): # identify the "main" data array if len(ds) == 1: - name, da = next(iter(ds.items())) + name = next(iter(ds.keys())) else: data_vars = { - name: var - for name, var in ds.items() + key: var + for key, var in ds.items() if any("coordinate" in attr for attr in var.attrs) } if len(data_vars) == 1: - name, da = next(iter(data_vars.items())) + name = next(iter(data_vars.keys())) else: raise ValueError("several possible data arrays detected") @@ -971,16 +971,23 @@ def from_netcdf(cls, fname, group=None): coords = Coordinates.from_dataset(ds, name) # read data - with h5py.File(fname) as file: - if group: - file = file[group] - name = "__values__" if da.name is None else da.name - variable = file[name] - if "__dask_array__" in variable.attrs: - data = loads(da.attrs.pop("__dask_array__")) - else: - data = VirtualSource(file[name]) - return cls(data, coords, da.dims, da.name, None if da.attrs == {} else da.attrs) + if "__dask_array__" in ds[name].attrs: + data = loads(ds[name].attrs.pop("__dask_array__")) + else: + with h5py.File(fname) as file: + if group: + file = file[group] + variable = file["__values__" if name is None else name] + data = VirtualSource(variable) + + # pack everything + return cls( + data, + coords, + ds[name].dims, + name, + None if ds[name].attrs == {} else ds[name].attrs, + ) def to_dict(self): """Convert the DataArray to a dictionary.""" From 0dfa3fc5a3adfe3ecd58b3d89cae0452f5369dee Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 15:07:13 +0100 Subject: [PATCH 
024/104] Rename ds -> dataset. --- xdas/core/coordinates.py | 36 ++++++++++++++++++------------------ xdas/core/dataarray.py | 28 +++++++++++++++------------- 2 files changed, 33 insertions(+), 31 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 8277896b..c3af298d 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -214,8 +214,8 @@ def from_dict(cls, dct): ) @classmethod - def from_dataset(cls, ds, name): - return Coordinate.from_dataset(ds, name) + def from_dataset(cls, dataset, name): + return Coordinate.from_dataset(dataset, name) def copy(self, deep=True): if deep: @@ -438,18 +438,18 @@ def to_dict(self): def from_dict(cls, dct): return cls(**dct) - def to_dataset(self, ds, attrs): - ds = ds.assign_coords( + def to_dataset(self, dataset, attrs): + dataset = dataset.assign_coords( {self.name: (self.dim, self.values) if self.dim else self.values} ) - return ds, attrs + return dataset, attrs @classmethod - def from_dataset(cls, ds, name): + def from_dataset(cls, dataset, name): coords = {} for subcls in cls.__subclasses__(): if hasattr(subcls, "from_dataset"): - coords |= subcls.from_dataset(ds, name) + coords |= subcls.from_dataset(dataset, name) return coords @@ -646,7 +646,7 @@ def to_dict(self): return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} @classmethod - def from_dataset(cls, ds, name): + def from_dataset(cls, dataset, name): return { name: ( ( @@ -660,7 +660,7 @@ def from_dataset(cls, ds, name): if coord.dims else coord.values ) - for name, coord in ds[name].coords.items() + for name, coord in dataset[name].coords.items() } @@ -1048,7 +1048,7 @@ def to_dict(self): } return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} - def to_dataset(self, ds, attrs): + def to_dataset(self, dataset, attrs): mapping = f"{self.name}: {self.name}_indices {self.name}_values" if "coordinate_interpolation" in attrs: attrs["coordinate_interpolation"] += " " + mapping @@ -1064,24 +1064,24 @@ def 
to_dataset(self, ds, attrs): "interpolation_name": "linear", "tie_points_mapping": f"{self.name}_points: {self.name}_indices {self.name}_values", } - ds.update( + dataset.update( { f"{self.name}_interpolation": ((), np.nan, interp_attrs), f"{self.name}_indices": (f"{self.name}_points", tie_indices), f"{self.name}_values": (f"{self.name}_points", tie_values), } ) - return ds, attrs + return dataset, attrs @classmethod - def from_dataset(cls, ds, name): + def from_dataset(cls, dataset, name): coords = {} - mapping = ds[name].attrs.pop("coordinate_interpolation", None) + mapping = dataset[name].attrs.pop("coordinate_interpolation", None) if mapping is not None: matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) for match in matches: dim, indices, values = match - data = {"tie_indices": ds[indices], "tie_values": ds[values]} + data = {"tie_indices": dataset[indices], "tie_values": dataset[values]} coords[dim] = Coordinate(data, dim) return coords @@ -1439,7 +1439,7 @@ def to_dict(self): } return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} - def to_dataset(self, ds, attrs): + def to_dataset(self, dataset, attrs): mapping = f"{self.name}: {self.name}_values {self.name}_lengths" if "coordinate_sampling" in attrs: attrs["coordinate_sampling"] += " " + mapping @@ -1455,14 +1455,14 @@ def to_dataset(self, ds, attrs): "sampling_interval": self.sampling_interval, "tie_points_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", } - ds.update( + dataset.update( { f"{self.name}_sampling": ((), np.nan, interp_attrs), f"{self.name}_values": (f"{self.name}_points", tie_values), f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), } ) - return ds, attrs + return dataset, attrs @classmethod def from_dataset(cls, dataset, name): diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index f9a7cf45..e3ba7a9a 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -876,13 +876,13 @@ def to_netcdf(self, fname, mode="w", 
group=None, virtual=None, encoding=None): virtual = isinstance(self.data, (VirtualArray, DaskArray)) # initialize - ds = xr.Dataset(attrs={"Conventions": "CF-1.9"}) + dataset = xr.Dataset(attrs={"Conventions": "CF-1.9"}) variable_attrs = {} if self.attrs is None else self.attrs variable_name = "__values__" if self.name is None else self.name # prepare metadata for coord in self.coords.values(): - ds, variable_attrs = coord.to_dataset(ds, variable_attrs) + dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs) # write data with h5netcdf.File(fname, mode=mode) as file: @@ -925,7 +925,7 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): variable.attrs.update(variable_attrs) # write metadata - ds.to_netcdf(fname, mode="a", group=group, engine="h5netcdf") + dataset.to_netcdf(fname, mode="a", group=group, engine="h5netcdf") @classmethod def from_netcdf(cls, fname, group=None): @@ -945,21 +945,23 @@ def from_netcdf(cls, fname, group=None): The openend data array. """ # read metadata - with xr.open_dataset(fname, group=group, engine="h5netcdf") as ds: + with xr.open_dataset(fname, group=group, engine="h5netcdf") as dataset: # check file format - if not ("Conventions" in ds.attrs and "CF" in ds.attrs["Conventions"]): + if not ( + "Conventions" in dataset.attrs and "CF" in dataset.attrs["Conventions"] + ): raise TypeError( "file format not recognized. 
please provide the file format " "with the `engine` keyword argument" ) # identify the "main" data array - if len(ds) == 1: - name = next(iter(ds.keys())) + if len(dataset) == 1: + name = next(iter(dataset.keys())) else: data_vars = { key: var - for key, var in ds.items() + for key, var in dataset.items() if any("coordinate" in attr for attr in var.attrs) } if len(data_vars) == 1: @@ -968,11 +970,11 @@ def from_netcdf(cls, fname, group=None): raise ValueError("several possible data arrays detected") # read coordinates - coords = Coordinates.from_dataset(ds, name) + coords = Coordinates.from_dataset(dataset, name) # read data - if "__dask_array__" in ds[name].attrs: - data = loads(ds[name].attrs.pop("__dask_array__")) + if "__dask_array__" in dataset[name].attrs: + data = loads(dataset[name].attrs.pop("__dask_array__")) else: with h5py.File(fname) as file: if group: @@ -984,9 +986,9 @@ def from_netcdf(cls, fname, group=None): return cls( data, coords, - ds[name].dims, + dataset[name].dims, name, - None if ds[name].attrs == {} else ds[name].attrs, + None if dataset[name].attrs == {} else dataset[name].attrs, ) def to_dict(self): From 5ca84454a1c3945ed917ed039cd1bf4003aec80b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 15:47:18 +0100 Subject: [PATCH 025/104] Implement decimation method in SampledCoordinate class --- xdas/core/coordinates.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index c3af298d..6fee083c 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1406,7 +1406,14 @@ def append(self, other): ) def decimate(self, q): - raise NotImplementedError("decimation is not implemented for SampledCoordinate") + return self.__class__( + { + "tie_values": self.tie_values, + "tie_lengths": (self.tie_lengths + q - 1) // q, + "sampling_interval": self.sampling_interval * q, + }, + self.dim, + ) def simplify(self, tolerance=None): raise 
NotImplementedError( From 680bf9088ad665809356ff6100c80fbb7ca9a01b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 15:47:33 +0100 Subject: [PATCH 026/104] Implement simplification method in SampledCoordinate class --- xdas/core/coordinates.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 6fee083c..30eb0fee 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1416,8 +1416,22 @@ def decimate(self, q): ) def simplify(self, tolerance=None): - raise NotImplementedError( - "simplification is not implemented for SampledCoordinate" + tie_values = [self.tie_values[0]] + tie_lengths = [self.tie_lengths[0]] + for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): + delta = value - (tie_values[-1] + self.sampling_interval * tie_lengths[-1]) + if np.abs(delta) <= tolerance: + tie_lengths[-1] += length + else: + tie_values.append(value) + tie_lengths.append(length) + return self.__class__( + { + "tie_values": np.array(tie_values), + "tie_lengths": np.array(tie_lengths), + "sampling_interval": self.sampling_interval, + }, + self.dim, ) def get_discontinuities(self): From 25ae329f4cb82c00c39091529ea754c7af87712b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 15:48:24 +0100 Subject: [PATCH 027/104] Implement get_discontinuities method in SampledCoordinate class --- xdas/core/coordinates.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 30eb0fee..ee1d0a30 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1435,9 +1435,34 @@ def simplify(self, tolerance=None): ) def get_discontinuities(self): - raise NotImplementedError( - "get_discontinuities is not implemented for SampledCoordinate" - ) + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + 
"start_value", + "end_value", + "delta", + "type", + ] + ) + records = [] + for index in self.tie_indices[: -1]: + start_index = index + end_index = index + 1 + start_value = self.get_value(index) + end_value = self.get_value(index + 1) + record = { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": ("gap" if end_value > start_value else "overlap"), + } + records.append(record) + return pd.DataFrame.from_records(records) + def get_availabilities(self): raise NotImplementedError( From 258a5ae771fa713f8f794c95b19b63f6684cc5c1 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 15:48:53 +0100 Subject: [PATCH 028/104] Implement get_availabilities method in SampledCoordinate class --- xdas/core/coordinates.py | 31 ++++++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index ee1d0a30..e712db02 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1465,9 +1465,34 @@ def get_discontinuities(self): def get_availabilities(self): - raise NotImplementedError( - "get_availabilities is not implemented for SampledCoordinate" - ) + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + records = [] + for index, value, length in zip(self.tie_indices, self.tie_values, self.tie_indices): + start_index = index + end_index = index + length - 1 + start_value = value + end_value = value + self.sampling_interval * (length - 1) + records.append( + { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": "data", + } + ) + return pd.DataFrame.from_records(records) @classmethod def from_array(cls, arr, dim=None, sampling_interval=None): From 
492c2e57ef2b19a31b8c2fd9895c9cd7527b7bd3 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 17 Dec 2025 15:49:15 +0100 Subject: [PATCH 029/104] format code --- xdas/core/coordinates.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index e712db02..65e7c0a9 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1447,8 +1447,8 @@ def get_discontinuities(self): ] ) records = [] - for index in self.tie_indices[: -1]: - start_index = index + for index in self.tie_indices[:-1]: + start_index = index end_index = index + 1 start_value = self.get_value(index) end_value = self.get_value(index + 1) @@ -1463,7 +1463,6 @@ def get_discontinuities(self): records.append(record) return pd.DataFrame.from_records(records) - def get_availabilities(self): if self.empty: return pd.DataFrame( @@ -1477,7 +1476,9 @@ def get_availabilities(self): ] ) records = [] - for index, value, length in zip(self.tie_indices, self.tie_values, self.tie_indices): + for index, value, length in zip( + self.tie_indices, self.tie_values, self.tie_indices + ): start_index = index end_index = index + length - 1 start_value = value From 0ecb7f84cef4b3c9870ac99ec4cb1f7ca86cce65 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 09:59:19 +0100 Subject: [PATCH 030/104] Fix and test SampledCoordinate slicing. 
--- tests/test_sampled_coordinate.py | 267 +++++++++++++++++++++++++++++++ xdas/core/coordinates.py | 143 ++++++++--------- 2 files changed, 330 insertions(+), 80 deletions(-) create mode 100644 tests/test_sampled_coordinate.py diff --git a/tests/test_sampled_coordinate.py b/tests/test_sampled_coordinate.py new file mode 100644 index 00000000..d914a4c2 --- /dev/null +++ b/tests/test_sampled_coordinate.py @@ -0,0 +1,267 @@ +import numpy as np +import pytest + +from xdas.core.coordinates import SampledCoordinate, ScalarCoordinate, DenseCoordinate + + +class TestSampledCoordinateBasics: + def test_isvalid(self): + assert SampledCoordinate.isvalid( + {"tie_values": [0.0], "tie_lengths": [1], "sampling_interval": 1.0} + ) + assert SampledCoordinate.isvalid( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [1], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + assert not SampledCoordinate.isvalid({"tie_values": [0.0], "tie_lengths": [1]}) + assert not SampledCoordinate.isvalid({}) + + def test_init_and_empty(self): + empty = SampledCoordinate() + assert empty.empty + assert len(empty) == 0 + assert empty.dtype is not None + assert empty.shape == (0,) + assert empty.ndim == 1 + assert empty.values.size == 0 + + def test_init_validation_numeric(self): + # valid numeric + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + assert len(coord) == 3 + assert coord.start == 0.0 + assert coord.end == 3.0 + + # mismatched lengths + with pytest.raises(ValueError): + SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [3], + "sampling_interval": 1.0, + } + ) + # non-integer lengths + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [1.5], "sampling_interval": 1.0} + ) + # non-positive lengths + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [0], "sampling_interval": 1.0} + ) + # sampling 
interval must be scalar + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": [1.0]} + ) + + def test_init_validation_datetime(self): + # valid datetime with timedelta sampling interval + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [t0], + "tie_lengths": [2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + assert coord.start == t0 + assert coord.end == t0 + np.timedelta64(2, "s") + + # invalid: datetime with numeric sampling interval + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": [t0], "tie_lengths": [2], "sampling_interval": 1} + ) + + +class TestSampledCoordinateIndexing: + def make_coord(self): + # Two segments: [0,1,2] and [10,11] + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_len_indices_values(self): + coord = self.make_coord() + assert len(coord) == 5 + assert np.array_equal(coord.indices, np.arange(5)) + assert np.array_equal(coord.values, np.array([0.0, 1.0, 2.0, 10.0, 11.0])) + + def test_get_value_scalar_and_vector(self): + coord = self.make_coord() + # scalar + assert coord.get_value(0) == 0.0 + assert coord.get_value(1) == 1.0 + assert coord.get_value(2) == 2.0 + assert coord.get_value(3) == 10.0 + assert coord.get_value(4) == 11.0 + # vectorized + vals = coord.get_value([0, 2, 3, 4]) + assert np.array_equal(vals, np.array([0.0, 2.0, 10.0, 11.0])) + # bounds + with pytest.raises(IndexError): + coord.get_value(-6) + with pytest.raises(IndexError): + coord.get_value(5) + + def test_getitem(self): + coord = self.make_coord() + # scalar -> ScalarCoordinate + item = coord[1] + assert isinstance(item, ScalarCoordinate) + assert item.values == 1.0 + # slice -> SampledCoordinate or compatible + sub = coord[1:4] + assert isinstance(sub, SampledCoordinate) + # array -> DenseCoordinate of values + arr = coord[[0, 4]] + assert isinstance(arr, 
DenseCoordinate) + assert np.array_equal(arr.values, np.array([0.0, 11.0])) + + def test_repr(self): + # Just ensure it returns a string + coord = self.make_coord() + assert isinstance(repr(coord), str) + + +class TestSampledCoordinateSlicing: + def make_coord(self): + # Two segments: [0,1,2] and [10,11] + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_slice_within_segment(self): + coord = self.make_coord() + sliced = coord[0:2] + assert isinstance(sliced, SampledCoordinate) + assert len(sliced) == 2 + assert np.array_equal(sliced.values, np.array([0.0, 1.0])) + + def test_slice_cross_segments(self): + coord = self.make_coord() + sliced = coord[1:4] + assert isinstance(sliced, SampledCoordinate) + assert len(sliced) == 3 + assert np.array_equal(sliced.values, np.array([1.0, 2.0, 10.0])) + + def test_slice_full(self): + coord = self.make_coord() + sliced = coord[:] + assert sliced.equals(coord) + + +class TestSampledCoordinateAppend: + def test_append_two_coords(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 1.0} + ) + result = coord1.append(coord2) + assert len(result) == 5 + assert result.tie_values[0] == 0.0 + assert result.tie_values[1] == 10.0 + + def test_append_empty(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate() + assert coord1.append(coord2).equals(coord1) + assert coord2.append(coord1).equals(coord1) + + +class TestSampledCoordinateDecimate: + def test_decimate(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [10], "sampling_interval": 1.0} + ) + decimated = coord.decimate(2) + assert decimated.sampling_interval == 2.0 + assert decimated.tie_lengths[0] == 5 # (10 + 2 - 1) // 2 = 5 + + +class 
TestSampledCoordinateSimplify: + def test_simplify_continuous(self): + # Two continuous segments should merge + coord = SampledCoordinate( + { + "tie_values": [0.0, 3.0], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + } + ) + simplified = coord.simplify(tolerance=0.1) + # If continuous (end of first == start of second), should merge + assert len(simplified.tie_values) <= 2 + + +class TestSampledCoordinateGetIndexer: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_get_indexer_exact(self): + coord = self.make_coord() + idx = coord.get_indexer(0.0, method="nearest") + assert idx == 0 + idx = coord.get_indexer(10.0, method="nearest") + assert idx == 3 + + def test_get_indexer_nearest(self): + coord = self.make_coord() + idx = coord.get_indexer(0.5, method="nearest") + assert idx in [0, 1] + + def test_get_indexer_out_of_bounds(self): + coord = self.make_coord() + with pytest.raises(KeyError): + coord.get_indexer(100.0) + + +class TestSampledCoordinateArithmetic: + def test_add(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + result = coord + 10.0 + assert result.tie_values[0] == 10.0 + assert np.array_equal(result.values, np.array([10.0, 11.0, 12.0])) + + def test_sub(self): + coord = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + result = coord - 5.0 + assert result.tie_values[0] == 5.0 + assert np.array_equal(result.values, np.array([5.0, 6.0, 7.0])) + + +class TestSampledCoordinateSerialization: + def test_to_from_dict(self): + coord = SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + }, + dim="time", + ) + d = coord.to_dict() + # round-trip via Coordinate factory + from xdas.core.coordinates import Coordinate + + back = Coordinate.from_dict(d) + assert isinstance(back, SampledCoordinate) + 
assert back.equals(coord) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 65e7c0a9..fb004194 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -364,6 +364,8 @@ def format_index_slice(self, slc): stop = len(self) if step is None: step = 1 + if step <= 0: + raise NotImplementedError("negative or zero step when slicing is not supported yet") start = self.format_index(start, bounds="clip") stop = self.format_index(stop, bounds="clip") return slice(start, stop, step) @@ -1120,7 +1122,7 @@ def __init__(self, data=None, dim=None, dtype=None): ) tie_values = np.asarray(data["tie_values"], dtype=dtype) tie_lengths = np.asarray(data["tie_lengths"]) - sampling_interval = np.asarray(data["sampling_interval"]) + sampling_interval = data["sampling_interval"] # check shapes if not tie_values.ndim == 1: @@ -1146,17 +1148,35 @@ def __init__(self, data=None, dim=None, dtype=None): if not np.isscalar(sampling_interval): raise ValueError("`sampling_interval` must be a scalar value") if np.issubdtype(tie_values.dtype, np.datetime64): - if not np.issubdtype(sampling_interval.dtype, np.timedelta64): + if not np.issubdtype(np.asarray(sampling_interval).dtype, np.timedelta64): raise ValueError( "`sampling_interval` must be timedelta64 for datetime64 `tie_values`" ) - self.data = dict( - tie_values=tie_values, - tie_lengths=tie_lengths, - sampling_interval=sampling_interval, - ) + + # store data + self.data = { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": sampling_interval, + } self.dim = dim + @property + def tie_values(self): + return self.data["tie_values"] + + @property + def tie_lengths(self): + return self.data["tie_lengths"] + + @property + def sampling_interval(self): + return self.data["sampling_interval"] + + @property + def dtype(self): + return self.tie_values.dtype + @staticmethod def isvalid(data): match data: @@ -1184,9 +1204,9 @@ def __repr__(self): if np.issubdtype(self.dtype, np.floating): 
return f"{self.start:.3f} to {self.end:.3f}" elif np.issubdtype(self.dtype, np.datetime64): - self.start = format_datetime(self.start) - self.end = format_datetime(self.end) - return f"{self.start} to {self.end}" + start_str = format_datetime(self.start) + end_str = format_datetime(self.end) + return f"{start_str} to {end_str}" else: return f"{self.start} to {self.end}" @@ -1230,18 +1250,6 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __array_function__(self, func, types, args, kwargs): raise NotImplementedError - @property - def tie_values(self): - return self.data["tie_values"] - - @property - def tie_lengths(self): - return self.data["tie_lengths"] - - @property - def sampling_interval(self): - return self.data["sampling_interval"] - @property def tie_indices(self): return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) @@ -1250,10 +1258,6 @@ def tie_indices(self): def empty(self): return self.tie_values.shape == (0,) - @property - def dtype(self): - return self.tie_values.dtype - @property def ndim(self): return self.tie_values.ndim @@ -1297,65 +1301,40 @@ def get_value(self, index): index = self.format_index(index) if np.any(index < 0) or np.any(index >= len(self)): raise IndexError("index is out of bounds") - reference = np.searchsorted(self.tie_indices, index) + reference = np.searchsorted(self.tie_indices, index, side="right") - 1 return self.tie_values[reference] + ( (index - self.tie_indices[reference]) * self.sampling_interval ) def slice_index(self, index_slice): index_slice = self.format_index_slice(index_slice) - start_index, stop_index, step_index = ( - index_slice.start, - index_slice.stop, - index_slice.step, - ) - if stop_index - start_index <= 0: - return self.__class__( - dict( - tie_values=[], - tie_lengths=[], - sampling_interval=self.sampling_interval, - ), - self.dim, - ) - elif (stop_index - start_index) <= step_index: - tie_values = [self.get_value(start_index)] - tie_lengths = [stop_index - start_index] - 
return self.__class__( - dict( - tie_values=tie_values, - tie_lengths=tie_lengths, - sampling_interval=self.sampling_interval, - ), - self.dim, - ) - else: - # keep tie values, number of samples and related tie indices contained in the slice - mask = (start_index < self.tie_indices) & (self.tie_indices <= stop_index) - tie_values = self.tie_values[mask] - tie_lengths = self.tie_lengths[mask] - tie_indices = self.tie_indices[mask] - # insert the missing start value - start_value = self.get_value(start_index) - tie_values = np.concatenate([[start_value], self.tie_values[mask]]) + # get indices relative to tie points + relative_start_index = np.clip(index_slice.start - self.tie_indices, 0, self.tie_lengths) + relative_stop_index = np.clip(index_slice.stop - self.tie_indices, 0, self.tie_lengths) - # insert the missing start number of samples and adjust the end one - tie_lengths = np.concatenate( - [[start_index - tie_indices[0]], tie_lengths[mask]] - ) - tie_lengths[-1] = stop_index - tie_indices[-1] + # keep segments with data + mask = relative_start_index < relative_stop_index + + # compute new tie points ane lengths + tie_values = self.tie_values[mask] + relative_start_index[mask] * self.sampling_interval + tie_lengths = relative_stop_index[mask] - relative_start_index[mask] + + # adjust for step if needed + if index_slice.step == 1: + sampling_interval = self.sampling_interval + else: + tie_lengths = (self.tie_lengths + index_slice.step - 1) // index_slice.step, + sampling_interval = self.sampling_interval * index_slice.step + + # build new coordinate + data = { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": sampling_interval, + } + return self.__class__(data, self.dim) - # repack data and decimate if needed - data = { - "tie_values": tie_values, - "tie_lengths": tie_lengths, - "sampling_interval": self.sampling_interval, - } - coord = self.__class__(data, self.dim) - if step_index != 1: - coord = coord.decimate(step_index) - 
return coord def get_indexer(self, value, method=None): if isinstance(value, str): @@ -1367,16 +1346,20 @@ def get_indexer(self, value, method=None): raise KeyError("index not found") if not is_strictly_increasing(self.tie_values): raise ValueError("tie_values must be strictly increasing") - reference = np.searchsorted(self.tie_values, value) + reference = np.searchsorted(self.tie_values, value, side="right") - 1 offset = (value - self.tie_values[reference]) / self.sampling_interval - if method == "nearest": + if method is None: + if np.any(offset % 1 != 0): + raise KeyError("index not found") + offset = offset.astype(int) + elif method == "nearest": offset = np.round(offset).astype(int) elif method == "ffill": offset = np.floor(offset).astype(int) elif method == "bfill": offset = np.ceil(offset).astype(int) else: - raise ValueError("method must be one of 'nearest', 'ffill', or 'bfill'") + raise ValueError("method must be one of `None`, 'nearest', 'ffill', or 'bfill'") return self.tie_indices[reference] + offset def append(self, other): @@ -1395,7 +1378,7 @@ def append(self, other): "cannot append coordinate with different sampling intervals" ) tie_values = np.concatenate([self.tie_values, other.tie_values]) - tie_lengths = np.concatenate([self.tie_lengths, other.tie_lengths + len(self)]) + tie_lengths = np.concatenate([self.tie_lengths, other.tie_lengths]) return self.__class__( { "tie_values": tie_values, From 57e554a96cb2d5a1d25985ae2b96d9ea56f06a14 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 10:01:23 +0100 Subject: [PATCH 031/104] minor refactoring. 
--- xdas/core/coordinates.py | 52 +++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index fb004194..916af7ad 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -365,7 +365,9 @@ def format_index_slice(self, slc): if step is None: step = 1 if step <= 0: - raise NotImplementedError("negative or zero step when slicing is not supported yet") + raise NotImplementedError( + "negative or zero step when slicing is not supported yet" + ) start = self.format_index(start, bounds="clip") stop = self.format_index(stop, bounds="clip") return slice(start, stop, step) @@ -1148,7 +1150,9 @@ def __init__(self, data=None, dim=None, dtype=None): if not np.isscalar(sampling_interval): raise ValueError("`sampling_interval` must be a scalar value") if np.issubdtype(tie_values.dtype, np.datetime64): - if not np.issubdtype(np.asarray(sampling_interval).dtype, np.timedelta64): + if not np.issubdtype( + np.asarray(sampling_interval).dtype, np.timedelta64 + ): raise ValueError( "`sampling_interval` must be timedelta64 for datetime64 `tie_values`" ) @@ -1310,21 +1314,29 @@ def slice_index(self, index_slice): index_slice = self.format_index_slice(index_slice) # get indices relative to tie points - relative_start_index = np.clip(index_slice.start - self.tie_indices, 0, self.tie_lengths) - relative_stop_index = np.clip(index_slice.stop - self.tie_indices, 0, self.tie_lengths) + relative_start_index = np.clip( + index_slice.start - self.tie_indices, 0, self.tie_lengths + ) + relative_stop_index = np.clip( + index_slice.stop - self.tie_indices, 0, self.tie_lengths + ) # keep segments with data mask = relative_start_index < relative_stop_index # compute new tie points ane lengths - tie_values = self.tie_values[mask] + relative_start_index[mask] * self.sampling_interval + tie_values = ( + self.tie_values[mask] + relative_start_index[mask] * self.sampling_interval + ) 
tie_lengths = relative_stop_index[mask] - relative_start_index[mask] # adjust for step if needed if index_slice.step == 1: sampling_interval = self.sampling_interval else: - tie_lengths = (self.tie_lengths + index_slice.step - 1) // index_slice.step, + tie_lengths = ( + (self.tie_lengths + index_slice.step - 1) // index_slice.step, + ) sampling_interval = self.sampling_interval * index_slice.step # build new coordinate @@ -1335,7 +1347,6 @@ def slice_index(self, index_slice): } return self.__class__(data, self.dim) - def get_indexer(self, value, method=None): if isinstance(value, str): value = np.datetime64(value) @@ -1348,18 +1359,21 @@ def get_indexer(self, value, method=None): raise ValueError("tie_values must be strictly increasing") reference = np.searchsorted(self.tie_values, value, side="right") - 1 offset = (value - self.tie_values[reference]) / self.sampling_interval - if method is None: - if np.any(offset % 1 != 0): - raise KeyError("index not found") - offset = offset.astype(int) - elif method == "nearest": - offset = np.round(offset).astype(int) - elif method == "ffill": - offset = np.floor(offset).astype(int) - elif method == "bfill": - offset = np.ceil(offset).astype(int) - else: - raise ValueError("method must be one of `None`, 'nearest', 'ffill', or 'bfill'") + match method: + case None: + if np.any(offset % 1 != 0): + raise KeyError("index not found") + offset = offset.astype(int) + case "nearest": + offset = np.round(offset).astype(int) + case "ffill": + offset = np.floor(offset).astype(int) + case "bfill": + offset = np.ceil(offset).astype(int) + case _: + raise ValueError( + "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" + ) return self.tie_indices[reference] + offset def append(self, other): From c310b6e6e38667d2cb6fb3a528a2af09e23311d8 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 11:17:14 +0100 Subject: [PATCH 032/104] Fix SampleCoordinate stepped slicing. Add tests. 
--- tests/test_sampled_coordinate.py | 95 ++++++++++++++++++++++++++++++++ xdas/core/coordinates.py | 15 ++--- 2 files changed, 99 insertions(+), 11 deletions(-) diff --git a/tests/test_sampled_coordinate.py b/tests/test_sampled_coordinate.py index d914a4c2..610f5d4a 100644 --- a/tests/test_sampled_coordinate.py +++ b/tests/test_sampled_coordinate.py @@ -2,6 +2,7 @@ import pytest from xdas.core.coordinates import SampledCoordinate, ScalarCoordinate, DenseCoordinate +import pandas as pd class TestSampledCoordinateBasics: @@ -132,6 +133,100 @@ def test_repr(self): assert isinstance(repr(coord), str) +class TestSampledCoordinateSliceEdgeCases: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + + def test_slice_negative_and_out_of_bounds(self): + coord = self.make_coord() + # negative slice indices + s = coord[-4:-1] + assert isinstance(s, SampledCoordinate) + # slice that extends beyond bounds should clip + s2 = coord[-10:10] + assert s2.equals(coord) + + def test_slice_step_decimate(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [10], "sampling_interval": 1.0} + ) + stepped = coord[::2] + decimated = coord.decimate(2) + assert isinstance(stepped, SampledCoordinate) + assert decimated.equals(stepped) + + +class TestSampledCoordinateAppendErrors: + def test_append_sampling_interval_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 2.0} + ) + with pytest.raises(ValueError): + coord1.append(coord2) + + def test_append_dtype_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [1], + "sampling_interval": np.timedelta64(1, 
"s"), + } + ) + with pytest.raises(ValueError): + coord1.append(coord2) + + +class TestSampledCoordinateDiscontinuitiesAvailabilities: + def test_discontinuities_and_availabilities(self): + # tie_lengths set to create 2 segments + coord = SampledCoordinate( + {"tie_values": [0.0, 5.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + dis = coord.get_discontinuities() + avail = coord.get_availabilities() + # expect DataFrame with specific columns + for df in (dis, avail): + assert isinstance(df, pd.DataFrame) + assert set(df.columns) >= { + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + } + # availabilities should list segments (2 segments -> 2 records) + assert len(avail) >= 1 + + +class TestSampledCoordinateToDatasetAndDict: + def test_to_dict_contains_expected_keys(self): + coord = SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + }, + dim="time", + ) + d = coord.to_dict() + assert "dim" in d + assert "data" in d + assert set(d["data"].keys()) >= { + "tie_values", + "tie_lengths", + "sampling_interval", + } + + class TestSampledCoordinateSlicing: def make_coord(self): # Two segments: [0,1,2] and [10,11] diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index 916af7ad..eb3ad691 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1313,6 +1313,8 @@ def get_value(self, index): def slice_index(self, index_slice): index_slice = self.format_index_slice(index_slice) + # TODO: optimize when start and/or stop are None + # get indices relative to tie points relative_start_index = np.clip( index_slice.start - self.tie_indices, 0, self.tie_lengths @@ -1334,9 +1336,7 @@ def slice_index(self, index_slice): if index_slice.step == 1: sampling_interval = self.sampling_interval else: - tie_lengths = ( - (self.tie_lengths + index_slice.step - 1) // index_slice.step, - ) + tie_lengths = (self.tie_lengths + index_slice.step - 1) // index_slice.step 
sampling_interval = self.sampling_interval * index_slice.step # build new coordinate @@ -1403,14 +1403,7 @@ def append(self, other): ) def decimate(self, q): - return self.__class__( - { - "tie_values": self.tie_values, - "tie_lengths": (self.tie_lengths + q - 1) // q, - "sampling_interval": self.sampling_interval * q, - }, - self.dim, - ) + return self[::q] def simplify(self, tolerance=None): tie_values = [self.tie_values[0]] From 9b36f358e3e2511a318e44821b942d4517e618ee Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 11:28:32 +0100 Subject: [PATCH 033/104] Fix bouds checks for SampledCooridnate. Add tests. --- tests/test_sampled_coordinate.py | 65 ++++++++++++++++++++++++++++++++ xdas/core/coordinates.py | 2 +- 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/tests/test_sampled_coordinate.py b/tests/test_sampled_coordinate.py index 610f5d4a..4645c3a7 100644 --- a/tests/test_sampled_coordinate.py +++ b/tests/test_sampled_coordinate.py @@ -360,3 +360,68 @@ def test_to_from_dict(self): back = Coordinate.from_dict(d) assert isinstance(back, SampledCoordinate) assert back.equals(coord) + + +class TestSampledCoordinateDatetime: + def make_dt_coord(self): + t0 = np.datetime64("2000-01-01T00:00:00") + return SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(10, "s")], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + + def test_datetime_values_and_dtype(self): + coord = self.make_dt_coord() + assert np.issubdtype(coord.dtype, np.datetime64) + vals = coord.values + assert np.issubdtype(vals.dtype, np.datetime64) + assert vals[0] == np.datetime64("2000-01-01T00:00:00") + assert vals[3] == np.datetime64("2000-01-01T00:00:10") + + def test_get_value_datetime(self): + coord = self.make_dt_coord() + assert coord.get_value(1) == np.datetime64("2000-01-01T00:00:01") + assert coord.get_value(4) == np.datetime64("2000-01-01T00:00:11") + with pytest.raises(IndexError): + coord.get_value(5) + + def 
test_get_indexer_datetime_methods(self): + coord = self.make_dt_coord() + t = np.datetime64("2000-01-01T00:00:01.500") + # exact required when method=None -> should raise + with pytest.raises(KeyError): + coord.get_indexer(t) + # method variants + assert coord.get_indexer(t, method="nearest") in [1, 2] + assert coord.get_indexer(t, method="ffill") == 1 + assert coord.get_indexer(t, method="bfill") == 2 + # bounds + with pytest.raises(KeyError): + coord.get_indexer(np.datetime64("1999-12-31T23:59:59")) + with pytest.raises(KeyError): + coord.get_indexer(np.datetime64("2000-01-01T00:00:12")) + + def test_start_end_properties_datetime(self): + coord = self.make_dt_coord() + assert coord.start == np.datetime64("2000-01-01T00:00:00") + # end is last tie_value + sampling_interval * last_length + assert coord.end == np.datetime64("2000-01-01T00:00:12") + + +class TestSampledCoordinateIndexerEdgeCases: + def test_invalid_method_raises(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + with pytest.raises(ValueError): + coord.get_indexer(0.0, method="bad") + + def test_non_increasing_tie_values_raises(self): + coord = SampledCoordinate( + {"tie_values": [2.0, 1.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + with pytest.raises(ValueError): + coord.get_indexer(2.0) diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates.py index eb3ad691..846b5b8c 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates.py @@ -1353,7 +1353,7 @@ def get_indexer(self, value, method=None): else: value = np.asarray(value) # Check that value lies within the coordinate value range (vectorized) - if np.any(value < self.start) or np.any(value > self.end): + if np.any(value < self.start) or np.any(value >= self.end): raise KeyError("index not found") if not is_strictly_increasing(self.tie_values): raise ValueError("tie_values must be strictly increasing") From 61e71c77abaf11b4bd4e49a9116ce45d63529159 Mon Sep 17 
00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 11:54:36 +0100 Subject: [PATCH 034/104] Refactoring: WIP one module per coordinate class. --- tests/test_sampled_coordinate.py | 4 +- xdas/core/coordinates/__init__.py | 10 + .../{coordinates.py => coordinates/core.py} | 475 +----------------- xdas/core/coordinates/interp.py | 454 +++++++++++++++++ 4 files changed, 490 insertions(+), 453 deletions(-) create mode 100644 xdas/core/coordinates/__init__.py rename xdas/core/{coordinates.py => coordinates/core.py} (70%) create mode 100644 xdas/core/coordinates/interp.py diff --git a/tests/test_sampled_coordinate.py b/tests/test_sampled_coordinate.py index 4645c3a7..643485dc 100644 --- a/tests/test_sampled_coordinate.py +++ b/tests/test_sampled_coordinate.py @@ -1,8 +1,8 @@ import numpy as np +import pandas as pd import pytest -from xdas.core.coordinates import SampledCoordinate, ScalarCoordinate, DenseCoordinate -import pandas as pd +from xdas.core.coordinates import DenseCoordinate, SampledCoordinate, ScalarCoordinate class TestSampledCoordinateBasics: diff --git a/xdas/core/coordinates/__init__.py b/xdas/core/coordinates/__init__.py new file mode 100644 index 00000000..49f17258 --- /dev/null +++ b/xdas/core/coordinates/__init__.py @@ -0,0 +1,10 @@ +from .core import ( + Coordinate, + Coordinates, + DefaultCoordinate, + DenseCoordinate, + SampledCoordinate, + ScalarCoordinate, + get_sampling_interval, +) +from .interp import InterpCoordinate diff --git a/xdas/core/coordinates.py b/xdas/core/coordinates/core.py similarity index 70% rename from xdas/core/coordinates.py rename to xdas/core/coordinates/core.py index 846b5b8c..ec4dea16 100644 --- a/xdas/core/coordinates.py +++ b/xdas/core/coordinates/core.py @@ -1,10 +1,9 @@ +import re from copy import copy, deepcopy from functools import wraps -import re import numpy as np import pandas as pd -from xinterp import forward, inverse def wraps_first_last(func): @@ -399,19 +398,25 @@ def slice_indexer(self, 
start=None, stop=None, step=None, endpoint=True): return slice(start_index, stop_index) def isscalar(self): - return isinstance(self, ScalarCoordinate) + return False + + def isdefault(self): + return False def isdense(self): - return isinstance(self, DenseCoordinate) + return False def isinterp(self): - return isinstance(self, InterpCoordinate) + return False + + def issampled(self): + return False def append(self, other): raise NotImplementedError(f"append is not implemented for {self.__class__}") def to_dataarray(self): - from .dataarray import DataArray # TODO: avoid defered import? + from ..dataarray import DataArray # TODO: avoid defered import? if self.name is None: raise ValueError("cannot convert unnamed coordinate to DataArray") @@ -485,6 +490,9 @@ def isvalid(data): data = np.asarray(data) return (data.dtype != np.dtype(object)) and (data.ndim == 0) + def isscalar(self): + return True + def equals(self, other): if isinstance(other, self.__class__): return self.data == other.data @@ -541,6 +549,9 @@ def isvalid(data): case _: return False + def isdefault(self): + return True + @property def empty(self): return bool(self.data["size"]) @@ -596,6 +607,9 @@ def isvalid(data): data = np.asarray(data) return (data.dtype != np.dtype(object)) and (data.ndim == 1) + def isdense(self): + return True + @property def index(self): return pd.Index(self.data) @@ -668,428 +682,6 @@ def from_dataset(cls, dataset, name): } -class InterpCoordinate(Coordinate): - """ - Array-like object used to represent piecewise evenly spaced coordinates using the - CF convention. - - The coordinate ticks are describes by the mean of tie points that are interpolated - when intermediate values are required. Coordinate objects provides label based - selections methods. - - Parameters - ---------- - tie_indices : sequence of integers - The indices of the tie points. Must include index 0 and be strictly increasing. - tie_values : sequence of float or datetime64 - The values of the tie points. 
Must be strictly increasing to enable label-based - selection. The len of `tie_indices` and `tie_values` sizes must match. - """ - - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, data=None, dim=None, dtype=None): - if data is None: - data = {"tie_indices": [], "tie_values": []} - data, dim = parse(data, dim) - if not self.__class__.isvalid(data): - raise TypeError("`data` must be dict-like") - if not set(data) == {"tie_indices", "tie_values"}: - raise ValueError( - "both `tie_indices` and `tie_values` key should be provided" - ) - tie_indices = np.asarray(data["tie_indices"]) - tie_values = np.asarray(data["tie_values"], dtype=dtype) - if not tie_indices.ndim == 1: - raise ValueError("`tie_indices` must be 1D") - if not tie_values.ndim == 1: - raise ValueError("`tie_values` must be 1D") - if not len(tie_indices) == len(tie_values): - raise ValueError("`tie_indices` and `tie_values` must have the same length") - if not tie_indices.shape == (0,): - if not np.issubdtype(tie_indices.dtype, np.integer): - raise ValueError("`tie_indices` must be integer-like") - if not tie_indices[0] == 0: - raise ValueError("`tie_indices` must start with a zero") - if not is_strictly_increasing(tie_indices): - raise ValueError("`tie_indices` must be strictly increasing") - if not ( - np.issubdtype(tie_values.dtype, np.number) - or np.issubdtype(tie_values.dtype, np.datetime64) - ): - raise ValueError("`tie_values` must have either numeric or datetime dtype") - tie_indices = tie_indices.astype(int) - self.data = dict(tie_indices=tie_indices, tie_values=tie_values) - self.dim = dim - - @staticmethod - def isvalid(data): - match data: - case {"tie_indices": _, "tie_values": _}: - return True - case _: - return False - - def __len__(self): - if self.empty: - return 0 - else: - return self.tie_indices[-1] - self.tie_indices[0] + 1 - - def __repr__(self): - if len(self) == 0: - return "empty coordinate" - elif len(self) == 1: - return 
f"{self.tie_values[0]}" - else: - if np.issubdtype(self.dtype, np.floating): - return f"{self.tie_values[0]:.3f} to {self.tie_values[-1]:.3f}" - elif np.issubdtype(self.dtype, np.datetime64): - start = format_datetime(self.tie_values[0]) - end = format_datetime(self.tie_values[-1]) - return f"{start} to {end}" - else: - return f"{self.tie_values[0]} to {self.tie_values[-1]}" - - def __getitem__(self, item): - if isinstance(item, slice): - return self.slice_index(item) - elif np.isscalar(item): - return ScalarCoordinate(self.get_value(item), None) - else: - return DenseCoordinate(self.get_value(item), self.dim) - - def __add__(self, other): - return self.__class__( - {"tie_indices": self.tie_indices, "tie_values": self.tie_values + other}, - self.dim, - ) - - def __sub__(self, other): - return self.__class__( - {"tie_indices": self.tie_indices, "tie_values": self.tie_values - other}, - self.dim, - ) - - def __array__(self, dtype=None): - out = self.values - if dtype is not None: - out = out.__array__(dtype) - return out - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - raise NotImplementedError - - def __array_function__(self, func, types, args, kwargs): - raise NotImplementedError - - @property - def tie_indices(self): - return self.data["tie_indices"] - - @property - def tie_values(self): - return self.data["tie_values"] - - @property - def empty(self): - return self.tie_indices.shape == (0,) - - @property - def dtype(self): - return self.tie_values.dtype - - @property - def ndim(self): - return self.tie_values.ndim - - @property - def shape(self): - return (len(self),) - - @property - def indices(self): - if self.empty: - return np.array([], dtype="int") - else: - return np.arange(self.tie_indices[-1] + 1) - - @property - def values(self): - if self.empty: - return np.array([], dtype=self.dtype) - else: - return self.get_value(self.indices) - - def equals(self, other): - return ( - np.array_equal(self.tie_indices, other.tie_indices) - and 
np.array_equal(self.tie_values, other.tie_values) - and self.dim == other.dim - and self.dtype == other.dtype - ) - - def get_value(self, index): - index = self.format_index(index) - return forward(index, self.tie_indices, self.tie_values) - - def slice_index(self, index_slice): - index_slice = self.format_index_slice(index_slice) - start_index, stop_index, step_index = ( - index_slice.start, - index_slice.stop, - index_slice.step, - ) - if stop_index - start_index <= 0: - return self.__class__(dict(tie_indices=[], tie_values=[]), dim=self.dim) - elif (stop_index - start_index) <= step_index: - tie_indices = [0] - tie_values = [self.get_value(start_index)] - return self.__class__( - dict(tie_indices=tie_indices, tie_values=tie_values), dim=self.dim - ) - else: - end_index = stop_index - 1 - start_value = self.get_value(start_index) - end_value = self.get_value(end_index) - mask = (start_index < self.tie_indices) & (self.tie_indices < end_index) - tie_indices = np.insert( - self.tie_indices[mask], - (0, self.tie_indices[mask].size), - (start_index, end_index), - ) - tie_values = np.insert( - self.tie_values[mask], - (0, self.tie_values[mask].size), - (start_value, end_value), - ) - tie_indices -= tie_indices[0] - data = {"tie_indices": tie_indices, "tie_values": tie_values} - coord = self.__class__(data, self.dim) - if step_index != 1: - coord = coord.decimate(step_index) - return coord - - def get_indexer(self, value, method=None): - if isinstance(value, str): - value = np.datetime64(value) - else: - value = np.asarray(value) - try: - indexer = inverse(value, self.tie_indices, self.tie_values, method) - except ValueError as e: - if str(e) == "fp must be strictly increasing": - raise ValueError( - "overlaps were found in the coordinate. If this is due to some " - "jitter in the tie values, consider smoothing the coordinate by " - "including some tolerance. 
This can be done by " - "`da[dim] = da[dim].simplify(tolerance)`, or by specifying a " - "tolerance when opening multiple files." - ) - else: - raise e - return indexer - - def append(self, other): - if not isinstance(other, self.__class__): - raise TypeError(f"cannot append {type(other)} to {self.__class__}") - if not self.dim == other.dim: - raise ValueError("cannot append coordinate with different dimension") - if self.empty: - return other - if other.empty: - return self - if not self.dtype == other.dtype: - raise ValueError("cannot append coordinate with different dtype") - coord = self.__class__( - { - "tie_indices": np.append( - self.tie_indices, other.tie_indices + len(self) - ), - "tie_values": np.append(self.tie_values, other.tie_values), - }, - self.dim, - ) - return coord - - def decimate(self, q): - tie_indices = (self.tie_indices // q) * q - for k in range(1, len(tie_indices) - 1): - if tie_indices[k] == tie_indices[k - 1]: - tie_indices[k] += q - tie_values = [self.get_value(idx) for idx in tie_indices] - tie_indices //= q - return self.__class__( - dict(tie_indices=tie_indices, tie_values=tie_values), self.dim - ) - - def simplify(self, tolerance=None): - if tolerance is None: - if np.issubdtype(self.dtype, np.datetime64): - tolerance = np.timedelta64(0, "ns") - else: - tolerance = 0.0 - tie_indices, tie_values = douglas_peucker( - self.tie_indices, self.tie_values, tolerance - ) - return self.__class__( - dict(tie_indices=tie_indices, tie_values=tie_values), self.dim - ) - - def get_discontinuities(self): - """ - Returns a DataFrame containing information about the discontinuities. - - Returns - ------- - pandas.DataFrame - A DataFrame with the following columns: - - - start_index : int - The index where the discontinuity starts. - - end_index : int - The index where the discontinuity ends. - - start_value : float - The value at the start of the discontinuity. - - end_value : float - The value at the end of the discontinuity. 
- - delta : float - The difference between the end_value and start_value. - - type : str - The type of the discontinuity, either "gap" or "overlap". - - """ - (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) - records = [] - for index in indices: - start_index = self.tie_indices[index] - end_index = self.tie_indices[index + 1] - start_value = self.tie_values[index] - end_value = self.tie_values[index + 1] - record = { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": ("gap" if end_value > start_value else "overlap"), - } - records.append(record) - return pd.DataFrame.from_records(records) - - def get_availabilities(self): - """ - Returns a DataFrame containing information about the data availability. - - Returns - ------- - pandas.DataFrame - A DataFrame with the following columns: - - - start_index : int - The index where the discontinuity starts. - - end_index : int - The index where the discontinuity ends. - - start_value : float - The value at the start of the discontinuity. - - end_value : float - The value at the end of the discontinuity. - - delta : float - The difference between the end_value and start_value. - - type : str - The type of the discontinuity, always "data". 
- - """ - if self.empty: - return pd.DataFrame( - columns=[ - "start_index", - "end_index", - "start_value", - "end_value", - "delta", - "type", - ] - ) - (cut_pos,) = np.nonzero(np.diff(self.tie_indices) == 1) - # start each segment after the previous cut (or at 0) - starts = np.concatenate(([0], cut_pos + 1)) - # end each segment at the cut position (or at n-1 for the last) - ends = np.concatenate((cut_pos, [len(self.tie_indices) - 1])) - records = [] - for s, e in zip(starts, ends): - start_index = self.tie_indices[s] - end_index = self.tie_indices[e] - start_value = self.tie_values[s] - end_value = self.tie_values[e] - records.append( - { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": "data", - } - ) - return pd.DataFrame.from_records(records) - - @classmethod - def from_array(cls, arr, dim=None, tolerance=None): - return cls( - {"tie_indices": np.arange(len(arr)), "tie_values": arr}, dim - ).simplify(tolerance) - - def to_dict(self): - tie_indices = self.data["tie_indices"] - tie_values = self.data["tie_values"] - if np.issubdtype(tie_values.dtype, np.datetime64): - tie_values = tie_values.astype(str) - data = { - "tie_indices": tie_indices.tolist(), - "tie_values": tie_values.tolist(), - } - return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} - - def to_dataset(self, dataset, attrs): - mapping = f"{self.name}: {self.name}_indices {self.name}_values" - if "coordinate_interpolation" in attrs: - attrs["coordinate_interpolation"] += " " + mapping - else: - attrs["coordinate_interpolation"] = mapping - tie_indices = self.tie_indices - tie_values = ( - self.tie_values.astype("M8[ns]") - if np.issubdtype(self.tie_values.dtype, np.datetime64) - else self.tie_values - ) - interp_attrs = { - "interpolation_name": "linear", - "tie_points_mapping": f"{self.name}_points: {self.name}_indices {self.name}_values", - } - dataset.update( - { - 
f"{self.name}_interpolation": ((), np.nan, interp_attrs), - f"{self.name}_indices": (f"{self.name}_points", tie_indices), - f"{self.name}_values": (f"{self.name}_points", tie_values), - } - ) - return dataset, attrs - - @classmethod - def from_dataset(cls, dataset, name): - coords = {} - mapping = dataset[name].attrs.pop("coordinate_interpolation", None) - if mapping is not None: - matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) - for match in matches: - dim, indices, values = match - data = {"tie_indices": dataset[indices], "tie_values": dataset[values]} - coords[dim] = Coordinate(data, dim) - return coords - - class SampledCoordinate(Coordinate): """ A coordinate that is sampled at regular intervals. @@ -1193,6 +785,9 @@ def isvalid(data): case _: return False + def issampled(self): + return True + def __len__(self): if self.empty: return 0 @@ -1580,7 +1175,7 @@ def get_sampling_interval(da, dim, cast=True): "cannot compute sample spacing on a dimension with less than 2 points" ) coord = da[dim] - if isinstance(coord, InterpCoordinate): + if coord.isinterp(): num = np.diff(coord.tie_values) den = np.diff(coord.tie_indices) mask = den != 1 @@ -1604,28 +1199,6 @@ def is_strictly_increasing(x): return np.all(np.diff(x) > 0) -def douglas_peucker(x, y, epsilon): - mask = np.ones(len(x), dtype=bool) - stack = [(0, len(x))] - while stack: - start, stop = stack.pop() - ysimple = forward( - x[start:stop], - x[[start, stop - 1]], - y[[start, stop - 1]], - ) - d = np.abs(y[start:stop] - ysimple) - index = np.argmax(d) - dmax = d[index] - index += start - if dmax > epsilon: - stack.append([start, index + 1]) - stack.append([index, stop]) - else: - mask[start + 1 : stop - 1] = False - return x[mask], y[mask] - - def format_datetime(x): string = str(x) if "." 
in string: diff --git a/xdas/core/coordinates/interp.py b/xdas/core/coordinates/interp.py new file mode 100644 index 00000000..5e60be34 --- /dev/null +++ b/xdas/core/coordinates/interp.py @@ -0,0 +1,454 @@ +import re + +import numpy as np +import pandas as pd +from xinterp import forward, inverse + +from .core import Coordinate, format_datetime, is_strictly_increasing, parse + + +class InterpCoordinate(Coordinate): + """ + Array-like object used to represent piecewise evenly spaced coordinates using the + CF convention. + + The coordinate ticks are describes by the mean of tie points that are interpolated + when intermediate values are required. Coordinate objects provides label based + selections methods. + + Parameters + ---------- + tie_indices : sequence of integers + The indices of the tie points. Must include index 0 and be strictly increasing. + tie_values : sequence of float or datetime64 + The values of the tie points. Must be strictly increasing to enable label-based + selection. The len of `tie_indices` and `tie_values` sizes must match. 
+ """ + + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + if data is None: + data = {"tie_indices": [], "tie_values": []} + data, dim = parse(data, dim) + if not self.__class__.isvalid(data): + raise TypeError("`data` must be dict-like") + if not set(data) == {"tie_indices", "tie_values"}: + raise ValueError( + "both `tie_indices` and `tie_values` key should be provided" + ) + tie_indices = np.asarray(data["tie_indices"]) + tie_values = np.asarray(data["tie_values"], dtype=dtype) + if not tie_indices.ndim == 1: + raise ValueError("`tie_indices` must be 1D") + if not tie_values.ndim == 1: + raise ValueError("`tie_values` must be 1D") + if not len(tie_indices) == len(tie_values): + raise ValueError("`tie_indices` and `tie_values` must have the same length") + if not tie_indices.shape == (0,): + if not np.issubdtype(tie_indices.dtype, np.integer): + raise ValueError("`tie_indices` must be integer-like") + if not tie_indices[0] == 0: + raise ValueError("`tie_indices` must start with a zero") + if not is_strictly_increasing(tie_indices): + raise ValueError("`tie_indices` must be strictly increasing") + if not ( + np.issubdtype(tie_values.dtype, np.number) + or np.issubdtype(tie_values.dtype, np.datetime64) + ): + raise ValueError("`tie_values` must have either numeric or datetime dtype") + tie_indices = tie_indices.astype(int) + self.data = dict(tie_indices=tie_indices, tie_values=tie_values) + self.dim = dim + + @staticmethod + def isvalid(data): + match data: + case {"tie_indices": _, "tie_values": _}: + return True + case _: + return False + + def isinterp(self): + return True + + def __len__(self): + if self.empty: + return 0 + else: + return self.tie_indices[-1] - self.tie_indices[0] + 1 + + def __repr__(self): + if len(self) == 0: + return "empty coordinate" + elif len(self) == 1: + return f"{self.tie_values[0]}" + else: + if np.issubdtype(self.dtype, np.floating): + return 
f"{self.tie_values[0]:.3f} to {self.tie_values[-1]:.3f}" + elif np.issubdtype(self.dtype, np.datetime64): + start = format_datetime(self.tie_values[0]) + end = format_datetime(self.tie_values[-1]) + return f"{start} to {end}" + else: + return f"{self.tie_values[0]} to {self.tie_values[-1]}" + + def __getitem__(self, item): + if isinstance(item, slice): + return self.slice_index(item) + elif np.isscalar(item): + return Coordinate(self.get_value(item), None) + else: + return Coordinate(self.get_value(item), self.dim) + + def __add__(self, other): + return self.__class__( + {"tie_indices": self.tie_indices, "tie_values": self.tie_values + other}, + self.dim, + ) + + def __sub__(self, other): + return self.__class__( + {"tie_indices": self.tie_indices, "tie_values": self.tie_values - other}, + self.dim, + ) + + def __array__(self, dtype=None): + out = self.values + if dtype is not None: + out = out.__array__(dtype) + return out + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError + + @property + def tie_indices(self): + return self.data["tie_indices"] + + @property + def tie_values(self): + return self.data["tie_values"] + + @property + def empty(self): + return self.tie_indices.shape == (0,) + + @property + def dtype(self): + return self.tie_values.dtype + + @property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(self.tie_indices[-1] + 1) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + + def equals(self, other): + return ( + np.array_equal(self.tie_indices, other.tie_indices) + and np.array_equal(self.tie_values, other.tie_values) + and self.dim == other.dim + and self.dtype == 
other.dtype + ) + + def get_value(self, index): + index = self.format_index(index) + return forward(index, self.tie_indices, self.tie_values) + + def slice_index(self, index_slice): + index_slice = self.format_index_slice(index_slice) + start_index, stop_index, step_index = ( + index_slice.start, + index_slice.stop, + index_slice.step, + ) + if stop_index - start_index <= 0: + return self.__class__(dict(tie_indices=[], tie_values=[]), dim=self.dim) + elif (stop_index - start_index) <= step_index: + tie_indices = [0] + tie_values = [self.get_value(start_index)] + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values), dim=self.dim + ) + else: + end_index = stop_index - 1 + start_value = self.get_value(start_index) + end_value = self.get_value(end_index) + mask = (start_index < self.tie_indices) & (self.tie_indices < end_index) + tie_indices = np.insert( + self.tie_indices[mask], + (0, self.tie_indices[mask].size), + (start_index, end_index), + ) + tie_values = np.insert( + self.tie_values[mask], + (0, self.tie_values[mask].size), + (start_value, end_value), + ) + tie_indices -= tie_indices[0] + data = {"tie_indices": tie_indices, "tie_values": tie_values} + coord = self.__class__(data, self.dim) + if step_index != 1: + coord = coord.decimate(step_index) + return coord + + def get_indexer(self, value, method=None): + if isinstance(value, str): + value = np.datetime64(value) + else: + value = np.asarray(value) + try: + indexer = inverse(value, self.tie_indices, self.tie_values, method) + except ValueError as e: + if str(e) == "fp must be strictly increasing": + raise ValueError( + "overlaps were found in the coordinate. If this is due to some " + "jitter in the tie values, consider smoothing the coordinate by " + "including some tolerance. This can be done by " + "`da[dim] = da[dim].simplify(tolerance)`, or by specifying a " + "tolerance when opening multiple files." 
+ ) + else: + raise e + return indexer + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + if self.empty: + return other + if other.empty: + return self + if not self.dtype == other.dtype: + raise ValueError("cannot append coordinate with different dtype") + coord = self.__class__( + { + "tie_indices": np.append( + self.tie_indices, other.tie_indices + len(self) + ), + "tie_values": np.append(self.tie_values, other.tie_values), + }, + self.dim, + ) + return coord + + def decimate(self, q): + tie_indices = (self.tie_indices // q) * q + for k in range(1, len(tie_indices) - 1): + if tie_indices[k] == tie_indices[k - 1]: + tie_indices[k] += q + tie_values = [self.get_value(idx) for idx in tie_indices] + tie_indices //= q + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values), self.dim + ) + + def simplify(self, tolerance=None): + if tolerance is None: + if np.issubdtype(self.dtype, np.datetime64): + tolerance = np.timedelta64(0, "ns") + else: + tolerance = 0.0 + tie_indices, tie_values = douglas_peucker( + self.tie_indices, self.tie_values, tolerance + ) + return self.__class__( + dict(tie_indices=tie_indices, tie_values=tie_values), self.dim + ) + + def get_discontinuities(self): + """ + Returns a DataFrame containing information about the discontinuities. + + Returns + ------- + pandas.DataFrame + A DataFrame with the following columns: + + - start_index : int + The index where the discontinuity starts. + - end_index : int + The index where the discontinuity ends. + - start_value : float + The value at the start of the discontinuity. + - end_value : float + The value at the end of the discontinuity. + - delta : float + The difference between the end_value and start_value. + - type : str + The type of the discontinuity, either "gap" or "overlap". 
+ + """ + (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) + records = [] + for index in indices: + start_index = self.tie_indices[index] + end_index = self.tie_indices[index + 1] + start_value = self.tie_values[index] + end_value = self.tie_values[index + 1] + record = { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": ("gap" if end_value > start_value else "overlap"), + } + records.append(record) + return pd.DataFrame.from_records(records) + + def get_availabilities(self): + """ + Returns a DataFrame containing information about the data availability. + + Returns + ------- + pandas.DataFrame + A DataFrame with the following columns: + + - start_index : int + The index where the discontinuity starts. + - end_index : int + The index where the discontinuity ends. + - start_value : float + The value at the start of the discontinuity. + - end_value : float + The value at the end of the discontinuity. + - delta : float + The difference between the end_value and start_value. + - type : str + The type of the discontinuity, always "data". 
+ + """ + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + (cut_pos,) = np.nonzero(np.diff(self.tie_indices) == 1) + # start each segment after the previous cut (or at 0) + starts = np.concatenate(([0], cut_pos + 1)) + # end each segment at the cut position (or at n-1 for the last) + ends = np.concatenate((cut_pos, [len(self.tie_indices) - 1])) + records = [] + for s, e in zip(starts, ends): + start_index = self.tie_indices[s] + end_index = self.tie_indices[e] + start_value = self.tie_values[s] + end_value = self.tie_values[e] + records.append( + { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": "data", + } + ) + return pd.DataFrame.from_records(records) + + @classmethod + def from_array(cls, arr, dim=None, tolerance=None): + return cls( + {"tie_indices": np.arange(len(arr)), "tie_values": arr}, dim + ).simplify(tolerance) + + def to_dict(self): + tie_indices = self.data["tie_indices"] + tie_values = self.data["tie_values"] + if np.issubdtype(tie_values.dtype, np.datetime64): + tie_values = tie_values.astype(str) + data = { + "tie_indices": tie_indices.tolist(), + "tie_values": tie_values.tolist(), + } + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + + def to_dataset(self, dataset, attrs): + mapping = f"{self.name}: {self.name}_indices {self.name}_values" + if "coordinate_interpolation" in attrs: + attrs["coordinate_interpolation"] += " " + mapping + else: + attrs["coordinate_interpolation"] = mapping + tie_indices = self.tie_indices + tie_values = ( + self.tie_values.astype("M8[ns]") + if np.issubdtype(self.tie_values.dtype, np.datetime64) + else self.tie_values + ) + interp_attrs = { + "interpolation_name": "linear", + "tie_points_mapping": f"{self.name}_points: {self.name}_indices {self.name}_values", + } + dataset.update( + { + 
f"{self.name}_interpolation": ((), np.nan, interp_attrs), + f"{self.name}_indices": (f"{self.name}_points", tie_indices), + f"{self.name}_values": (f"{self.name}_points", tie_values), + } + ) + return dataset, attrs + + @classmethod + def from_dataset(cls, dataset, name): + coords = {} + mapping = dataset[name].attrs.pop("coordinate_interpolation", None) + if mapping is not None: + matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) + for match in matches: + dim, indices, values = match + data = {"tie_indices": dataset[indices], "tie_values": dataset[values]} + coords[dim] = Coordinate(data, dim) + return coords + + +def douglas_peucker(x, y, epsilon): + mask = np.ones(len(x), dtype=bool) + stack = [(0, len(x))] + while stack: + start, stop = stack.pop() + ysimple = forward( + x[start:stop], + x[[start, stop - 1]], + y[[start, stop - 1]], + ) + d = np.abs(y[start:stop] - ysimple) + index = np.argmax(d) + dmax = d[index] + index += start + if dmax > epsilon: + stack.append([start, index + 1]) + stack.append([index, stop]) + else: + mask[start + 1 : stop - 1] = False + return x[mask], y[mask] From d9c82d204b91ffddbb6cc24f238ff9f3f24ffe4a Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 11:58:31 +0100 Subject: [PATCH 035/104] Refactoring: separeate sampled module for SamplesCoordinate. 
--- xdas/core/coordinates/__init__.py | 2 +- xdas/core/coordinates/core.py | 459 +---------------------------- xdas/core/coordinates/sampled.py | 463 ++++++++++++++++++++++++++++++ 3 files changed, 465 insertions(+), 459 deletions(-) create mode 100644 xdas/core/coordinates/sampled.py diff --git a/xdas/core/coordinates/__init__.py b/xdas/core/coordinates/__init__.py index 49f17258..d9105307 100644 --- a/xdas/core/coordinates/__init__.py +++ b/xdas/core/coordinates/__init__.py @@ -3,8 +3,8 @@ Coordinates, DefaultCoordinate, DenseCoordinate, - SampledCoordinate, ScalarCoordinate, get_sampling_interval, ) from .interp import InterpCoordinate +from .sampled import SampledCoordinate diff --git a/xdas/core/coordinates/core.py b/xdas/core/coordinates/core.py index ec4dea16..5dc05597 100644 --- a/xdas/core/coordinates/core.py +++ b/xdas/core/coordinates/core.py @@ -682,463 +682,6 @@ def from_dataset(cls, dataset, name): } -class SampledCoordinate(Coordinate): - """ - A coordinate that is sampled at regular intervals. - - Parameters - ---------- - data : dict-like - The data of the coordinate. - dim : str, optional - The dimension name of the coordinate, by default None. - dtype : str or numpy.dtype, optional - The data type of the coordinate, by default None. 
- """ - - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, data=None, dim=None, dtype=None): - # empty - if data is None: - data = {"tie_values": [], "tie_lengths": [], "sampling_interval": None} - empty = True - else: - empty = False - - # parse data - data, dim = parse(data, dim) - if not self.__class__.isvalid(data): - raise TypeError( - "`data` must be dict-like and contain `tie_values`, `tie_lengths`, and " - "`sampling_interval`" - ) - tie_values = np.asarray(data["tie_values"], dtype=dtype) - tie_lengths = np.asarray(data["tie_lengths"]) - sampling_interval = data["sampling_interval"] - - # check shapes - if not tie_values.ndim == 1: - raise ValueError("`tie_values` must be 1D") - if not tie_lengths.ndim == 1: - raise ValueError("`tie_lengths` must be 1D") - if not len(tie_values) == len(tie_lengths): - raise ValueError("`tie_values` and `tie_lengths` must have the same length") - - # check dtypes - if not empty: - if not ( - np.issubdtype(tie_values.dtype, np.number) - or np.issubdtype(tie_values.dtype, np.datetime64) - ): - raise ValueError( - "`tie_values` must have either numeric or datetime dtype" - ) - if not np.issubdtype(tie_lengths.dtype, np.integer): - raise ValueError("`tie_lengths` must be integer-like") - if not np.all(tie_lengths > 0): - raise ValueError("`tie_lengths` must be strictly positive integers") - if not np.isscalar(sampling_interval): - raise ValueError("`sampling_interval` must be a scalar value") - if np.issubdtype(tie_values.dtype, np.datetime64): - if not np.issubdtype( - np.asarray(sampling_interval).dtype, np.timedelta64 - ): - raise ValueError( - "`sampling_interval` must be timedelta64 for datetime64 `tie_values`" - ) - - # store data - self.data = { - "tie_values": tie_values, - "tie_lengths": tie_lengths, - "sampling_interval": sampling_interval, - } - self.dim = dim - - @property - def tie_values(self): - return self.data["tie_values"] - - @property - def tie_lengths(self): - return 
self.data["tie_lengths"] - - @property - def sampling_interval(self): - return self.data["sampling_interval"] - - @property - def dtype(self): - return self.tie_values.dtype - - @staticmethod - def isvalid(data): - match data: - case { - "tie_values": _, - "tie_lengths": _, - "sampling_interval": _, - }: - return True - case _: - return False - - def issampled(self): - return True - - def __len__(self): - if self.empty: - return 0 - else: - return sum(self.tie_lengths) - - def __repr__(self): - if self.empty: - return "empty coordinate" - elif len(self) == 1: - return f"{self.tie_values[0]}" - else: - if np.issubdtype(self.dtype, np.floating): - return f"{self.start:.3f} to {self.end:.3f}" - elif np.issubdtype(self.dtype, np.datetime64): - start_str = format_datetime(self.start) - end_str = format_datetime(self.end) - return f"{start_str} to {end_str}" - else: - return f"{self.start} to {self.end}" - - def __getitem__(self, item): - if isinstance(item, slice): - return self.slice_index(item) - elif np.isscalar(item): - return ScalarCoordinate(self.get_value(item), None) - else: - return DenseCoordinate(self.get_value(item), self.dim) - - def __add__(self, other): - return self.__class__( - { - "tie_values": self.tie_values + other, - "tie_lengths": self.tie_lengths, - "sampling_interval": self.sampling_interval, - }, - self.dim, - ) - - def __sub__(self, other): - return self.__class__( - { - "tie_values": self.tie_values - other, - "tie_lengths": self.tie_lengths, - "sampling_interval": self.sampling_interval, - }, - self.dim, - ) - - def __array__(self, dtype=None): - out = self.values - if dtype is not None: - out = out.__array__(dtype) - return out - - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - raise NotImplementedError - - def __array_function__(self, func, types, args, kwargs): - raise NotImplementedError - - @property - def tie_indices(self): - return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) - - @property - def empty(self): 
- return self.tie_values.shape == (0,) - - @property - def ndim(self): - return self.tie_values.ndim - - @property - def shape(self): - return (len(self),) - - @property - def indices(self): - if self.empty: - return np.array([], dtype="int") - else: - return np.arange(len(self)) - - @property - def values(self): - if self.empty: - return np.array([], dtype=self.dtype) - else: - return self.get_value(self.indices) - - @property - def start(self): - return self.tie_values[0] - - @property - def end(self): - return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] - - def equals(self, other): - return ( - np.array_equal(self.tie_values, other.tie_values) - and np.array_equal(self.tie_lengths, other.tie_lengths) - and self.sampling_interval == other.sampling_interval - and self.dim == other.dim - and self.dtype == other.dtype - ) - - def get_value(self, index): - index = self.format_index(index) - if np.any(index < 0) or np.any(index >= len(self)): - raise IndexError("index is out of bounds") - reference = np.searchsorted(self.tie_indices, index, side="right") - 1 - return self.tie_values[reference] + ( - (index - self.tie_indices[reference]) * self.sampling_interval - ) - - def slice_index(self, index_slice): - index_slice = self.format_index_slice(index_slice) - - # TODO: optimize when start and/or stop are None - - # get indices relative to tie points - relative_start_index = np.clip( - index_slice.start - self.tie_indices, 0, self.tie_lengths - ) - relative_stop_index = np.clip( - index_slice.stop - self.tie_indices, 0, self.tie_lengths - ) - - # keep segments with data - mask = relative_start_index < relative_stop_index - - # compute new tie points ane lengths - tie_values = ( - self.tie_values[mask] + relative_start_index[mask] * self.sampling_interval - ) - tie_lengths = relative_stop_index[mask] - relative_start_index[mask] - - # adjust for step if needed - if index_slice.step == 1: - sampling_interval = self.sampling_interval - else: - 
tie_lengths = (self.tie_lengths + index_slice.step - 1) // index_slice.step - sampling_interval = self.sampling_interval * index_slice.step - - # build new coordinate - data = { - "tie_values": tie_values, - "tie_lengths": tie_lengths, - "sampling_interval": sampling_interval, - } - return self.__class__(data, self.dim) - - def get_indexer(self, value, method=None): - if isinstance(value, str): - value = np.datetime64(value) - else: - value = np.asarray(value) - # Check that value lies within the coordinate value range (vectorized) - if np.any(value < self.start) or np.any(value >= self.end): - raise KeyError("index not found") - if not is_strictly_increasing(self.tie_values): - raise ValueError("tie_values must be strictly increasing") - reference = np.searchsorted(self.tie_values, value, side="right") - 1 - offset = (value - self.tie_values[reference]) / self.sampling_interval - match method: - case None: - if np.any(offset % 1 != 0): - raise KeyError("index not found") - offset = offset.astype(int) - case "nearest": - offset = np.round(offset).astype(int) - case "ffill": - offset = np.floor(offset).astype(int) - case "bfill": - offset = np.ceil(offset).astype(int) - case _: - raise ValueError( - "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" - ) - return self.tie_indices[reference] + offset - - def append(self, other): - if not isinstance(other, self.__class__): - raise TypeError(f"cannot append {type(other)} to {self.__class__}") - if not self.dim == other.dim: - raise ValueError("cannot append coordinate with different dimension") - if self.empty: - return other - if other.empty: - return self - if not self.dtype == other.dtype: - raise ValueError("cannot append coordinate with different dtype") - if not self.sampling_interval == other.sampling_interval: - raise ValueError( - "cannot append coordinate with different sampling intervals" - ) - tie_values = np.concatenate([self.tie_values, other.tie_values]) - tie_lengths = 
np.concatenate([self.tie_lengths, other.tie_lengths]) - return self.__class__( - { - "tie_values": tie_values, - "tie_lengths": tie_lengths, - "sampling_interval": self.sampling_interval, - }, - self.dim, - ) - - def decimate(self, q): - return self[::q] - - def simplify(self, tolerance=None): - tie_values = [self.tie_values[0]] - tie_lengths = [self.tie_lengths[0]] - for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): - delta = value - (tie_values[-1] + self.sampling_interval * tie_lengths[-1]) - if np.abs(delta) <= tolerance: - tie_lengths[-1] += length - else: - tie_values.append(value) - tie_lengths.append(length) - return self.__class__( - { - "tie_values": np.array(tie_values), - "tie_lengths": np.array(tie_lengths), - "sampling_interval": self.sampling_interval, - }, - self.dim, - ) - - def get_discontinuities(self): - if self.empty: - return pd.DataFrame( - columns=[ - "start_index", - "end_index", - "start_value", - "end_value", - "delta", - "type", - ] - ) - records = [] - for index in self.tie_indices[:-1]: - start_index = index - end_index = index + 1 - start_value = self.get_value(index) - end_value = self.get_value(index + 1) - record = { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": ("gap" if end_value > start_value else "overlap"), - } - records.append(record) - return pd.DataFrame.from_records(records) - - def get_availabilities(self): - if self.empty: - return pd.DataFrame( - columns=[ - "start_index", - "end_index", - "start_value", - "end_value", - "delta", - "type", - ] - ) - records = [] - for index, value, length in zip( - self.tie_indices, self.tie_values, self.tie_indices - ): - start_index = index - end_index = index + length - 1 - start_value = value - end_value = value + self.sampling_interval * (length - 1) - records.append( - { - "start_index": start_index, - "end_index": end_index, - "start_value": 
start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": "data", - } - ) - return pd.DataFrame.from_records(records) - - @classmethod - def from_array(cls, arr, dim=None, sampling_interval=None): - raise NotImplementedError("from_array is not implemented for SampledCoordinate") - - def to_dict(self): - tie_values = self.data["tie_values"] - tie_lengths = self.data["tie_lengths"] - if np.issubdtype(tie_values.dtype, np.datetime64): - tie_values = tie_values.astype(str) - data = { - "tie_values": tie_values.tolist(), - "tie_lengths": tie_lengths.tolist(), - "sampling_interval": self.sampling_interval, - } - return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} - - def to_dataset(self, dataset, attrs): - mapping = f"{self.name}: {self.name}_values {self.name}_lengths" - if "coordinate_sampling" in attrs: - attrs["coordinate_sampling"] += " " + mapping - else: - attrs["coordinate_sampling"] = mapping - tie_values = ( - self.tie_values.astype("M8[ns]") - if np.issubdtype(self.tie_values.dtype, np.datetime64) - else self.tie_values - ) - tie_lengths = self.tie_lengths - interp_attrs = { - "sampling_interval": self.sampling_interval, - "tie_points_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", - } - dataset.update( - { - f"{self.name}_sampling": ((), np.nan, interp_attrs), - f"{self.name}_values": (f"{self.name}_points", tie_values), - f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), - } - ) - return dataset, attrs - - @classmethod - def from_dataset(cls, dataset, name): - coords = {} - mapping = dataset[name].attrs.pop("coordinate_sampling", None) - if mapping is not None: - matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) - for match in matches: - dim, values, lengths = match - sampling_interval = ... 
- data = { - "tie_values": dataset[values], - "tie_lengths": dataset[lengths], - "sampling_interval": sampling_interval, - } - coords[dim] = Coordinate(data, dim) - return coords - - def parse(data, dim=None): if isinstance(data, tuple): if dim is None: @@ -1182,7 +725,7 @@ def get_sampling_interval(da, dim, cast=True): num = num[mask] den = den[mask] d = np.median(num / den) - elif isinstance(coord, SampledCoordinate): + elif coord.issampled(): d = coord.sampling_interval else: d = (coord[-1].values - coord[0].values) / (len(coord) - 1) diff --git a/xdas/core/coordinates/sampled.py b/xdas/core/coordinates/sampled.py new file mode 100644 index 00000000..70b2e0a4 --- /dev/null +++ b/xdas/core/coordinates/sampled.py @@ -0,0 +1,463 @@ +import re + +import numpy as np +import pandas as pd + +from .core import Coordinate, format_datetime, is_strictly_increasing, parse + + +class SampledCoordinate(Coordinate): + """ + A coordinate that is sampled at regular intervals. + + Parameters + ---------- + data : dict-like + The data of the coordinate. + dim : str, optional + The dimension name of the coordinate, by default None. + dtype : str or numpy.dtype, optional + The data type of the coordinate, by default None. 
+ """ + + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + # empty + if data is None: + data = {"tie_values": [], "tie_lengths": [], "sampling_interval": None} + empty = True + else: + empty = False + + # parse data + data, dim = parse(data, dim) + if not self.__class__.isvalid(data): + raise TypeError( + "`data` must be dict-like and contain `tie_values`, `tie_lengths`, and " + "`sampling_interval`" + ) + tie_values = np.asarray(data["tie_values"], dtype=dtype) + tie_lengths = np.asarray(data["tie_lengths"]) + sampling_interval = data["sampling_interval"] + + # check shapes + if not tie_values.ndim == 1: + raise ValueError("`tie_values` must be 1D") + if not tie_lengths.ndim == 1: + raise ValueError("`tie_lengths` must be 1D") + if not len(tie_values) == len(tie_lengths): + raise ValueError("`tie_values` and `tie_lengths` must have the same length") + + # check dtypes + if not empty: + if not ( + np.issubdtype(tie_values.dtype, np.number) + or np.issubdtype(tie_values.dtype, np.datetime64) + ): + raise ValueError( + "`tie_values` must have either numeric or datetime dtype" + ) + if not np.issubdtype(tie_lengths.dtype, np.integer): + raise ValueError("`tie_lengths` must be integer-like") + if not np.all(tie_lengths > 0): + raise ValueError("`tie_lengths` must be strictly positive integers") + if not np.isscalar(sampling_interval): + raise ValueError("`sampling_interval` must be a scalar value") + if np.issubdtype(tie_values.dtype, np.datetime64): + if not np.issubdtype( + np.asarray(sampling_interval).dtype, np.timedelta64 + ): + raise ValueError( + "`sampling_interval` must be timedelta64 for datetime64 `tie_values`" + ) + + # store data + self.data = { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": sampling_interval, + } + self.dim = dim + + @property + def tie_values(self): + return self.data["tie_values"] + + @property + def tie_lengths(self): + return 
self.data["tie_lengths"] + + @property + def sampling_interval(self): + return self.data["sampling_interval"] + + @property + def dtype(self): + return self.tie_values.dtype + + @staticmethod + def isvalid(data): + match data: + case { + "tie_values": _, + "tie_lengths": _, + "sampling_interval": _, + }: + return True + case _: + return False + + def issampled(self): + return True + + def __len__(self): + if self.empty: + return 0 + else: + return sum(self.tie_lengths) + + def __repr__(self): + if self.empty: + return "empty coordinate" + elif len(self) == 1: + return f"{self.tie_values[0]}" + else: + if np.issubdtype(self.dtype, np.floating): + return f"{self.start:.3f} to {self.end:.3f}" + elif np.issubdtype(self.dtype, np.datetime64): + start_str = format_datetime(self.start) + end_str = format_datetime(self.end) + return f"{start_str} to {end_str}" + else: + return f"{self.start} to {self.end}" + + def __getitem__(self, item): + if isinstance(item, slice): + return self.slice_index(item) + elif np.isscalar(item): + return Coordinate(self.get_value(item), None) + else: + return Coordinate(self.get_value(item), self.dim) + + def __add__(self, other): + return self.__class__( + { + "tie_values": self.tie_values + other, + "tie_lengths": self.tie_lengths, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def __sub__(self, other): + return self.__class__( + { + "tie_values": self.tie_values - other, + "tie_lengths": self.tie_lengths, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def __array__(self, dtype=None): + out = self.values + if dtype is not None: + out = out.__array__(dtype) + return out + + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError + + @property + def tie_indices(self): + return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) + + @property + def empty(self): + return 
self.tie_values.shape == (0,) + + @property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(len(self)) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + + @property + def start(self): + return self.tie_values[0] + + @property + def end(self): + return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] + + def equals(self, other): + return ( + np.array_equal(self.tie_values, other.tie_values) + and np.array_equal(self.tie_lengths, other.tie_lengths) + and self.sampling_interval == other.sampling_interval + and self.dim == other.dim + and self.dtype == other.dtype + ) + + def get_value(self, index): + index = self.format_index(index) + if np.any(index < 0) or np.any(index >= len(self)): + raise IndexError("index is out of bounds") + reference = np.searchsorted(self.tie_indices, index, side="right") - 1 + return self.tie_values[reference] + ( + (index - self.tie_indices[reference]) * self.sampling_interval + ) + + def slice_index(self, index_slice): + index_slice = self.format_index_slice(index_slice) + + # TODO: optimize when start and/or stop are None + + # get indices relative to tie points + relative_start_index = np.clip( + index_slice.start - self.tie_indices, 0, self.tie_lengths + ) + relative_stop_index = np.clip( + index_slice.stop - self.tie_indices, 0, self.tie_lengths + ) + + # keep segments with data + mask = relative_start_index < relative_stop_index + + # compute new tie points ane lengths + tie_values = ( + self.tie_values[mask] + relative_start_index[mask] * self.sampling_interval + ) + tie_lengths = relative_stop_index[mask] - relative_start_index[mask] + + # adjust for step if needed + if index_slice.step == 1: + sampling_interval = self.sampling_interval + else: + tie_lengths = 
(self.tie_lengths + index_slice.step - 1) // index_slice.step + sampling_interval = self.sampling_interval * index_slice.step + + # build new coordinate + data = { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": sampling_interval, + } + return self.__class__(data, self.dim) + + def get_indexer(self, value, method=None): + if isinstance(value, str): + value = np.datetime64(value) + else: + value = np.asarray(value) + # Check that value lies within the coordinate value range (vectorized) + if np.any(value < self.start) or np.any(value >= self.end): + raise KeyError("index not found") + if not is_strictly_increasing(self.tie_values): + raise ValueError("tie_values must be strictly increasing") + reference = np.searchsorted(self.tie_values, value, side="right") - 1 + offset = (value - self.tie_values[reference]) / self.sampling_interval + match method: + case None: + if np.any(offset % 1 != 0): + raise KeyError("index not found") + offset = offset.astype(int) + case "nearest": + offset = np.round(offset).astype(int) + case "ffill": + offset = np.floor(offset).astype(int) + case "bfill": + offset = np.ceil(offset).astype(int) + case _: + raise ValueError( + "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" + ) + return self.tie_indices[reference] + offset + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + if self.empty: + return other + if other.empty: + return self + if not self.dtype == other.dtype: + raise ValueError("cannot append coordinate with different dtype") + if not self.sampling_interval == other.sampling_interval: + raise ValueError( + "cannot append coordinate with different sampling intervals" + ) + tie_values = np.concatenate([self.tie_values, other.tie_values]) + tie_lengths = np.concatenate([self.tie_lengths, 
other.tie_lengths]) + return self.__class__( + { + "tie_values": tie_values, + "tie_lengths": tie_lengths, + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def decimate(self, q): + return self[::q] + + def simplify(self, tolerance=None): + tie_values = [self.tie_values[0]] + tie_lengths = [self.tie_lengths[0]] + for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): + delta = value - (tie_values[-1] + self.sampling_interval * tie_lengths[-1]) + if np.abs(delta) <= tolerance: + tie_lengths[-1] += length + else: + tie_values.append(value) + tie_lengths.append(length) + return self.__class__( + { + "tie_values": np.array(tie_values), + "tie_lengths": np.array(tie_lengths), + "sampling_interval": self.sampling_interval, + }, + self.dim, + ) + + def get_discontinuities(self): + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + records = [] + for index in self.tie_indices[:-1]: + start_index = index + end_index = index + 1 + start_value = self.get_value(index) + end_value = self.get_value(index + 1) + record = { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": ("gap" if end_value > start_value else "overlap"), + } + records.append(record) + return pd.DataFrame.from_records(records) + + def get_availabilities(self): + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + records = [] + for index, value, length in zip( + self.tie_indices, self.tie_values, self.tie_indices + ): + start_index = index + end_index = index + length - 1 + start_value = value + end_value = value + self.sampling_interval * (length - 1) + records.append( + { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + 
"delta": end_value - start_value, + "type": "data", + } + ) + return pd.DataFrame.from_records(records) + + @classmethod + def from_array(cls, arr, dim=None, sampling_interval=None): + raise NotImplementedError("from_array is not implemented for SampledCoordinate") + + def to_dict(self): + tie_values = self.data["tie_values"] + tie_lengths = self.data["tie_lengths"] + if np.issubdtype(tie_values.dtype, np.datetime64): + tie_values = tie_values.astype(str) + data = { + "tie_values": tie_values.tolist(), + "tie_lengths": tie_lengths.tolist(), + "sampling_interval": self.sampling_interval, + } + return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} + + def to_dataset(self, dataset, attrs): + mapping = f"{self.name}: {self.name}_values {self.name}_lengths" + if "coordinate_sampling" in attrs: + attrs["coordinate_sampling"] += " " + mapping + else: + attrs["coordinate_sampling"] = mapping + tie_values = ( + self.tie_values.astype("M8[ns]") + if np.issubdtype(self.tie_values.dtype, np.datetime64) + else self.tie_values + ) + tie_lengths = self.tie_lengths + interp_attrs = { + "sampling_interval": self.sampling_interval, + "tie_points_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", + } + dataset.update( + { + f"{self.name}_sampling": ((), np.nan, interp_attrs), + f"{self.name}_values": (f"{self.name}_points", tie_values), + f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), + } + ) + return dataset, attrs + + @classmethod + def from_dataset(cls, dataset, name): + coords = {} + mapping = dataset[name].attrs.pop("coordinate_sampling", None) + if mapping is not None: + matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) + for match in matches: + dim, values, lengths = match + sampling_interval = ... 
+ data = { + "tie_values": dataset[values], + "tie_lengths": dataset[lengths], + "sampling_interval": sampling_interval, + } + coords[dim] = Coordinate(data, dim) + return coords From 87bc375ac5ee1e0d50837259704ae8889f4edd0e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 12:03:22 +0100 Subject: [PATCH 036/104] Refactoring: separate dense module for DenseCoordinate. --- xdas/core/coordinates/__init__.py | 2 +- xdas/core/coordinates/core.py | 95 ------------------------------ xdas/core/coordinates/dense.py | 97 +++++++++++++++++++++++++++++++ 3 files changed, 98 insertions(+), 96 deletions(-) create mode 100644 xdas/core/coordinates/dense.py diff --git a/xdas/core/coordinates/__init__.py b/xdas/core/coordinates/__init__.py index d9105307..37dbff55 100644 --- a/xdas/core/coordinates/__init__.py +++ b/xdas/core/coordinates/__init__.py @@ -2,9 +2,9 @@ Coordinate, Coordinates, DefaultCoordinate, - DenseCoordinate, ScalarCoordinate, get_sampling_interval, ) +from .dense import DenseCoordinate from .interp import InterpCoordinate from .sampled import SampledCoordinate diff --git a/xdas/core/coordinates/core.py b/xdas/core/coordinates/core.py index 5dc05597..48e8e7b0 100644 --- a/xdas/core/coordinates/core.py +++ b/xdas/core/coordinates/core.py @@ -1,9 +1,7 @@ -import re from copy import copy, deepcopy from functools import wraps import numpy as np -import pandas as pd def wraps_first_last(func): @@ -589,99 +587,6 @@ def to_dict(self): return {"dim": self.dim, "data": self.data.tolist(), "dtype": str(self.dtype)} -class DenseCoordinate(Coordinate): - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, data=None, dim=None, dtype=None): - if data is None: - data = [] - data, dim = parse(data, dim) - if not self.isvalid(data): - raise TypeError("`data` must be array-like") - self.data = np.asarray(data, dtype=dtype) - self.dim = dim - - @staticmethod - def isvalid(data): - data = np.asarray(data) - return (data.dtype 
class DenseCoordinate(Coordinate):
    """Coordinate backed by an explicit 1-D array of values."""

    def __new__(cls, *args, **kwargs):
        # Bypass the dispatching Coordinate.__new__: the subclass is already known.
        return object.__new__(cls)

    def __init__(self, data=None, dim=None, dtype=None):
        data = [] if data is None else data
        data, dim = parse(data, dim)
        if not self.isvalid(data):
            raise TypeError("`data` must be array-like")
        self.data = np.asarray(data, dtype=dtype)
        self.dim = dim

    @staticmethod
    def isvalid(data):
        # Valid dense data: one-dimensional and not an object array.
        arr = np.asarray(data)
        if arr.dtype == np.dtype(object):
            return False
        return arr.ndim == 1

    def isdense(self):
        return True

    @property
    def index(self):
        # Built lazily; pandas provides lookup/slicing semantics for free.
        return pd.Index(self.data)

    def equals(self, other):
        """Return True if `other` is a DenseCoordinate with identical content."""
        if not isinstance(other, self.__class__):
            return False
        return (
            np.array_equal(self.data, other.data)
            and self.dim == other.dim
            and self.dtype == other.dtype
        )

    def get_indexer(self, value, method=None):
        """Return the integer position(s) of `value`, raising if absent."""
        if np.isscalar(value):
            result = self.index.get_indexer([value], method).item()
        else:
            result = self.index.get_indexer(value, method)
        if np.any(result == -1):
            raise KeyError("index not found")
        return result

    def slice_indexer(self, start=None, stop=None, step=None, endpoint=True):
        """Return an integer slice selecting values in [start, stop]."""
        slc = self.index.slice_indexer(start, stop, step)
        # With endpoint=False, drop the last element when it equals `stop`.
        drop_last = (
            (not endpoint)
            and (stop is not None)
            and (self[slc.stop - 1].values == stop)
        )
        if drop_last:
            slc = slice(slc.start, slc.stop - 1, slc.step)
        return slc

    def append(self, other):
        """Concatenate another DenseCoordinate after this one."""
        if not isinstance(other, self.__class__):
            raise TypeError(f"cannot append {type(other)} to {self.__class__}")
        if not self.dim == other.dim:
            raise ValueError("cannot append coordinate with different dimension")
        if self.empty:
            return other
        if other.empty:
            return self
        if not self.dtype == other.dtype:
            raise ValueError("cannot append coordinate with different dtype")
        return self.__class__(np.concatenate([self.data, other.data]), self.dim)

    def to_dict(self):
        values = self.data
        if np.issubdtype(self.dtype, np.datetime64):
            # datetime64 is not JSON-serializable; round-trip through strings.
            values = values.astype(str)
        return {"dim": self.dim, "data": values.tolist(), "dtype": str(self.dtype)}

    @classmethod
    def from_dataset(cls, dataset, name):
        out = {}
        for key, coord in dataset[name].coords.items():
            if coord.dims:
                values = coord.values
                if coord.dtype == np.dtype("O"):
                    # Object arrays come back from storage as strings.
                    values = values.astype("U")
                out[key] = (coord.dims[0], values)
            else:
                out[key] = coord.values
        return out
class ScalarCoordinate(Coordinate):
    """Coordinate holding a single (0-d) value, attached to no dimension."""

    def __new__(cls, *args, **kwargs):
        # Bypass the dispatching Coordinate.__new__: the subclass is already known.
        return object.__new__(cls)

    def __init__(self, data=None, dim=None, dtype=None):
        if data is None:
            raise TypeError("scalar coordinate cannot be empty, please provide a value")
        data, dim = parse(data, dim)
        if dim is not None:
            raise ValueError("a scalar coordinate cannot be a dim")
        if not self.__class__.isvalid(data):
            raise TypeError("`data` must be scalar-like")
        self.data = np.asarray(data, dtype=dtype)

    @property
    def dim(self):
        # A scalar is never associated with a dimension.
        return None

    @dim.setter
    def dim(self, value):
        if value is not None:
            # BUG FIX: message typo "other that" -> "other than".
            raise ValueError("A scalar coordinate cannot have a `dim` other than None")

    @staticmethod
    def isvalid(data):
        # Valid scalar data: zero-dimensional and not an object array.
        data = np.asarray(data)
        return (data.dtype != np.dtype(object)) and (data.ndim == 0)

    def isscalar(self):
        return True

    def equals(self, other):
        """Return True if `other` is a ScalarCoordinate with an equal value."""
        if isinstance(other, self.__class__):
            return self.data == other.data
        else:
            return False

    def to_index(self, item, method=None, endpoint=True):
        raise NotImplementedError("cannot get index of scalar coordinate")

    def to_dict(self):
        if np.issubdtype(self.dtype, np.datetime64):
            # datetime64 is not JSON-serializable; round-trip through a string.
            data = self.data.astype(str).item()
        else:
            data = self.data.item()
        return {"dim": self.dim, "data": data, "dtype": str(self.dtype)}
--- xdas/core/coordinates/__init__.py | 2 +- xdas/core/coordinates/core.py | 79 ----------------------------- xdas/core/coordinates/default.py | 84 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 80 deletions(-) create mode 100644 xdas/core/coordinates/default.py diff --git a/xdas/core/coordinates/__init__.py b/xdas/core/coordinates/__init__.py index c6a3ca11..09f17353 100644 --- a/xdas/core/coordinates/__init__.py +++ b/xdas/core/coordinates/__init__.py @@ -1,9 +1,9 @@ from .core import ( Coordinate, Coordinates, - DefaultCoordinate, get_sampling_interval, ) +from .default import DefaultCoordinate from .dense import DenseCoordinate from .interp import InterpCoordinate from .sampled import SampledCoordinate diff --git a/xdas/core/coordinates/core.py b/xdas/core/coordinates/core.py index 64c3e980..a7c011b3 100644 --- a/xdas/core/coordinates/core.py +++ b/xdas/core/coordinates/core.py @@ -462,85 +462,6 @@ def from_dataset(cls, dataset, name): return coords -class DefaultCoordinate(Coordinate): - def __new__(cls, *args, **kwargs): - return object.__new__(cls) - - def __init__(self, data=None, dim=None, dtype=None): - if data is None: - data = {"size": 0} - data, dim = parse(data, dim) - if not self.isvalid(data): - raise TypeError("`data` must be a mapping {'size': }") - if dtype is not None: - raise ValueError("`dtype` is not supported for DefaultCoordinate") - self.data = data - self.dim = dim - - def __len__(self): - if self.data["size"] is None: - return 0 - else: - return self.data["size"] - - def __getitem__(self, item): - data = self.__array__()[item] - if ScalarCoordinate.isvalid(data): - return ScalarCoordinate(data) - else: - return Coordinate(data, self.dim) - - def __array__(self, dtype=None): - return np.arange(self.data["size"], dtype=dtype) - - @staticmethod - def isvalid(data): - match data: - case {"size": None | int(_)}: - return True - case _: - return False - - def isdefault(self): - return True - - @property - def empty(self): - 
return bool(self.data["size"]) - - @property - def dtype(self): - return np.int64 - - @property - def ndim(self): - return 1 - - @property - def shape(self): - return (len(self),) - - def equals(self, other): - if isinstance(other, self.__class__): - return self.data["size"] == other.data["size"] - - def get_indexer(self, value, method=None): - return value - - def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): - return slice(start, stop, step) - - def append(self, other): - if not isinstance(other, self.__class__): - raise TypeError(f"cannot append {type(other)} to {self.__class__}") - if not self.dim == other.dim: - raise ValueError("cannot append coordinate with different dimension") - return self.__class__({"size": len(self) + len(other)}, self.dim) - - def to_dict(self): - return {"dim": self.dim, "data": self.data.tolist(), "dtype": str(self.dtype)} - - def parse(data, dim=None): if isinstance(data, tuple): if dim is None: diff --git a/xdas/core/coordinates/default.py b/xdas/core/coordinates/default.py new file mode 100644 index 00000000..960db620 --- /dev/null +++ b/xdas/core/coordinates/default.py @@ -0,0 +1,84 @@ +import numpy as np + +from .core import Coordinate, parse + + +class DefaultCoordinate(Coordinate): + def __new__(cls, *args, **kwargs): + return object.__new__(cls) + + def __init__(self, data=None, dim=None, dtype=None): + if data is None: + data = {"size": 0} + data, dim = parse(data, dim) + if not self.isvalid(data): + raise TypeError("`data` must be a mapping {'size': }") + if dtype is not None: + raise ValueError("`dtype` is not supported for DefaultCoordinate") + self.data = data + self.dim = dim + + def __len__(self): + if self.data["size"] is None: + return 0 + else: + return self.data["size"] + + def __getitem__(self, item): + from .scalar import ScalarCoordinate + + data = self.__array__()[item] + if ScalarCoordinate.isvalid(data): + return ScalarCoordinate(data) + else: + return Coordinate(data, self.dim) + + 
def __array__(self, dtype=None): + return np.arange(self.data["size"], dtype=dtype) + + @staticmethod + def isvalid(data): + match data: + case {"size": None | int(_)}: + return True + case _: + return False + + def isdefault(self): + return True + + @property + def empty(self): + return bool(self.data["size"]) + + @property + def dtype(self): + return np.int64 + + @property + def ndim(self): + return 1 + + @property + def shape(self): + return (len(self),) + + def equals(self, other): + if isinstance(other, self.__class__): + return self.data["size"] == other.data["size"] + + def get_indexer(self, value, method=None): + return value + + def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): + return slice(start, stop, step) + + def append(self, other): + if not isinstance(other, self.__class__): + raise TypeError(f"cannot append {type(other)} to {self.__class__}") + if not self.dim == other.dim: + raise ValueError("cannot append coordinate with different dimension") + return self.__class__({"size": len(self) + len(other)}, self.dim) + + def to_dict(self): + return {"dim": self.dim, "data": self.data.tolist(), "dtype": str(self.dtype)} From b2c16cc49567ea9a15f67467119819387628eea1 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 12:23:29 +0100 Subject: [PATCH 039/104] Add Coordinate.isscalar. 
--- xdas/core/coordinates/core.py | 13 +++++++------ xdas/core/coordinates/default.py | 9 +++------ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/xdas/core/coordinates/core.py b/xdas/core/coordinates/core.py index a7c011b3..967dbc54 100644 --- a/xdas/core/coordinates/core.py +++ b/xdas/core/coordinates/core.py @@ -258,13 +258,9 @@ def __new__(cls, data=None, dim=None, dtype=None): raise TypeError("could not parse `data`") def __getitem__(self, item): - from .scalar import ScalarCoordinate - data = self.data.__getitem__(item) - if ScalarCoordinate.isvalid(data): - return ScalarCoordinate(data) - else: - return Coordinate(data, self.dim) + dim = None if isscalar(data) else self.dim + return Coordinate(data, dim) def __len__(self): return self.data.__len__() @@ -515,6 +511,11 @@ def get_sampling_interval(da, dim, cast=True): return d +def isscalar(data): + data = np.asarray(data) + return (data.dtype != np.dtype(object)) and (data.ndim == 0) + + def is_strictly_increasing(x): if np.issubdtype(x.dtype, np.datetime64): return np.all(np.diff(x) > np.timedelta64(0, "ns")) diff --git a/xdas/core/coordinates/default.py b/xdas/core/coordinates/default.py index 960db620..d52c9d95 100644 --- a/xdas/core/coordinates/default.py +++ b/xdas/core/coordinates/default.py @@ -1,6 +1,6 @@ import numpy as np -from .core import Coordinate, parse +from .core import Coordinate, isscalar, parse class DefaultCoordinate(Coordinate): @@ -25,13 +25,10 @@ def __len__(self): return self.data["size"] def __getitem__(self, item): - from .scalar import ScalarCoordinate data = self.__array__()[item] - if ScalarCoordinate.isvalid(data): - return ScalarCoordinate(data) - else: - return Coordinate(data, self.dim) + dim = None if isscalar(data) else self.dim + return Coordinate(data, dim) def __array__(self, dtype=None): return np.arange(self.data["size"], dtype=dtype) From 5a9146bdec16ed0d1adb31c30cf32a833a7f4596 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 
15:26:18 +0100 Subject: [PATCH 040/104] Refactor get_sampling_interval: now a method of each Coordinate subclass. --- xdas/core/coordinates/core.py | 31 +++++++++++-------------------- xdas/core/coordinates/default.py | 4 +++- xdas/core/coordinates/interp.py | 13 +++++++++++++ xdas/core/coordinates/sampled.py | 3 +++ xdas/core/coordinates/scalar.py | 3 +++ xdas/core/routines.py | 4 ++-- 6 files changed, 35 insertions(+), 23 deletions(-) diff --git a/xdas/core/coordinates/core.py b/xdas/core/coordinates/core.py index 967dbc54..f70c36cd 100644 --- a/xdas/core/coordinates/core.py +++ b/xdas/core/coordinates/core.py @@ -323,6 +323,15 @@ def name(self): return self.dim return next((name for name in self.parent if self.parent[name] is self), None) + def get_sampling_interval(self, cast=True): + if len(self) < 2: + return None + delta = (self[-1].values - self[0].values) / (len(self) - 1) + delta = np.asarray(delta) # TODO: why? + if cast and np.issubdtype(delta.dtype, np.timedelta64): + delta = delta / np.timedelta64(1, "s") + return delta + def isdim(self): if self.parent is None or self.name is None: return None @@ -488,27 +497,9 @@ def get_sampling_interval(da, dim, cast=True): ------- float The sample spacing.
+ """ - if da.sizes[dim] < 2: - raise ValueError( - "cannot compute sample spacing on a dimension with less than 2 points" - ) - coord = da[dim] - if coord.isinterp(): - num = np.diff(coord.tie_values) - den = np.diff(coord.tie_indices) - mask = den != 1 - num = num[mask] - den = den[mask] - d = np.median(num / den) - elif coord.issampled(): - d = coord.sampling_interval - else: - d = (coord[-1].values - coord[0].values) / (len(coord) - 1) - d = np.asarray(d) - if cast and np.issubdtype(d.dtype, np.timedelta64): - d = d / np.timedelta64(1, "s") - return d + return da[dim].get_sampling_interval(cast=cast) def isscalar(data): diff --git a/xdas/core/coordinates/default.py b/xdas/core/coordinates/default.py index d52c9d95..8b11220b 100644 --- a/xdas/core/coordinates/default.py +++ b/xdas/core/coordinates/default.py @@ -25,7 +25,6 @@ def __len__(self): return self.data["size"] def __getitem__(self, item): - data = self.__array__()[item] dim = None if isscalar(data) else self.dim return Coordinate(data, dim) @@ -60,6 +59,9 @@ def ndim(self): def shape(self): return (len(self),) + def get_sampling_interval(self, cast=True): + return 1 + def equals(self, other): if isinstance(other, self.__class__): return self.data["size"] == other.data["size"] diff --git a/xdas/core/coordinates/interp.py b/xdas/core/coordinates/interp.py index 5e60be34..6a3f7f60 100644 --- a/xdas/core/coordinates/interp.py +++ b/xdas/core/coordinates/interp.py @@ -164,6 +164,19 @@ def values(self): else: return self.get_value(self.indices) + def get_sampling_interval(self, cast=True): + if len(self) < 2: + return None + num = np.diff(self.tie_values) + den = np.diff(self.tie_indices) + mask = den != 1 + num = num[mask] + den = den[mask] + delta = np.median(num / den) + if cast and np.issubdtype(delta.dtype, np.timedelta64): + delta = delta / np.timedelta64(1, "s") + return delta + def equals(self, other): return ( np.array_equal(self.tie_indices, other.tie_indices) diff --git 
a/xdas/core/coordinates/sampled.py b/xdas/core/coordinates/sampled.py index 70b2e0a4..28fead89 100644 --- a/xdas/core/coordinates/sampled.py +++ b/xdas/core/coordinates/sampled.py @@ -112,6 +112,9 @@ def isvalid(data): def issampled(self): return True + def get_sampling_interval(self, cast=True): + return self.sampling_interval + def __len__(self): if self.empty: return 0 diff --git a/xdas/core/coordinates/scalar.py b/xdas/core/coordinates/scalar.py index ec30f660..f0cda938 100644 --- a/xdas/core/coordinates/scalar.py +++ b/xdas/core/coordinates/scalar.py @@ -26,6 +26,9 @@ def dim(self, value): if value is not None: raise ValueError("A scalar coordinate cannot have a `dim` other that None") + def get_sampling_interval(self, cast=True): + return None + @staticmethod def isvalid(data): data = np.asarray(data) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index bc9927f3..187c88d8 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -612,9 +612,9 @@ def initialize(self, da): if self.dim in self.dims else da.coords.drop_coords(self.dim) ) - try: + if self.dim in da.coords: self.delta = get_sampling_interval(da, self.dim) - except (ValueError, KeyError): + else: self.delta = None self.dtype = da.dtype From bcddaf15187d282359c9c36b5dfc1b36e7addc61 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 16:14:17 +0100 Subject: [PATCH 041/104] Massive paths refactoring. 
--- tests/{ => coordinates}/test_coordinates.py | 88 +----------------- .../test_sampled.py} | 9 +- tests/coordinates/test_scalar.py | 90 +++++++++++++++++++ tests/test_core.py | 83 ++++++++--------- tests/test_dataarray.py | 85 +++++++++--------- tests/test_datacollection.py | 32 +++---- tests/test_processing.py | 26 +++--- tests/test_routines.py | 90 ++++++++++--------- tests/test_virtual.py | 10 +-- tests/test_xarray.py | 14 +-- xdas/__init__.py | 7 +- xdas/atoms/ml.py | 2 +- xdas/atoms/signal.py | 2 +- xdas/{core => }/coordinates/__init__.py | 0 xdas/{core => }/coordinates/core.py | 2 +- xdas/{core => }/coordinates/default.py | 0 xdas/{core => }/coordinates/dense.py | 0 xdas/{core => }/coordinates/interp.py | 0 xdas/{core => }/coordinates/sampled.py | 0 xdas/{core => }/coordinates/scalar.py | 0 xdas/core/dataarray.py | 4 +- xdas/core/routines.py | 3 +- xdas/fft.py | 2 +- xdas/io/__init__.py | 1 - xdas/io/asn.py | 3 +- xdas/io/miniseed.py | 2 +- xdas/processing/__init__.py | 1 - xdas/signal.py | 2 +- xdas/spectral.py | 2 +- xdas/trigger.py | 2 +- 30 files changed, 281 insertions(+), 281 deletions(-) rename tests/{ => coordinates}/test_coordinates.py (89%) rename tests/{test_sampled_coordinate.py => coordinates/test_sampled.py} (99%) create mode 100644 tests/coordinates/test_scalar.py rename xdas/{core => }/coordinates/__init__.py (100%) rename xdas/{core => }/coordinates/core.py (99%) rename xdas/{core => }/coordinates/default.py (100%) rename xdas/{core => }/coordinates/dense.py (100%) rename xdas/{core => }/coordinates/interp.py (100%) rename xdas/{core => }/coordinates/sampled.py (100%) rename xdas/{core => }/coordinates/scalar.py (100%) diff --git a/tests/test_coordinates.py b/tests/coordinates/test_coordinates.py similarity index 89% rename from tests/test_coordinates.py rename to tests/coordinates/test_coordinates.py index 0b295fa7..2a3166a1 100644 --- a/tests/test_coordinates.py +++ b/tests/coordinates/test_coordinates.py @@ -3,93 +3,7 @@ import pytest 
import xdas -from xdas.core.coordinates import DenseCoordinate, InterpCoordinate, ScalarCoordinate - - -class TestScalarCoordinate: - valid = [ - 1, - np.array(1), - 1.0, - np.array(1.0), - "label", - np.array("label"), - np.datetime64(1, "s"), - ] - invalid = [[1], np.array([1]), {"key": "value"}] - - def test_isvalid(self): - for data in self.valid: - assert ScalarCoordinate.isvalid(data) - for data in self.invalid: - assert not ScalarCoordinate.isvalid(data) - - def test_init(self): - coord = ScalarCoordinate(1) - assert coord.data == 1 - assert coord.dim is None - coord = ScalarCoordinate(1, None) - assert coord.dim is None - with pytest.raises(ValueError): - ScalarCoordinate(1, "dim") - for data in self.valid: - assert ScalarCoordinate(data).data == np.array(data) - for data in self.invalid: - with pytest.raises(TypeError): - ScalarCoordinate(data) - - def test_getitem(self): - assert ScalarCoordinate(1)[...].equals(ScalarCoordinate(1)) - with pytest.raises(IndexError): - ScalarCoordinate(1)[:] - with pytest.raises(IndexError): - ScalarCoordinate(1)[0] - - def test_len(self): - with pytest.raises(TypeError): - len(ScalarCoordinate(1)) - - def test_repr(self): - for data in self.valid: - assert ScalarCoordinate(data).__repr__() == np.array2string( - np.asarray(data), threshold=0, edgeitems=1 - ) - - def test_array(self): - for data in self.valid: - assert ScalarCoordinate(data).__array__() == np.array(data) - - def test_dtype(self): - for data in self.valid: - assert ScalarCoordinate(data).dtype == np.array(data).dtype - - def test_values(self): - for data in self.valid: - assert ScalarCoordinate(data).values == np.array(data) - - def test_equals(self): - for data in self.valid: - coord = ScalarCoordinate(data) - assert coord.equals(coord) - assert ScalarCoordinate(1).equals(ScalarCoordinate(np.array(1))) - - def test_to_index(self): - with pytest.raises(NotImplementedError): - ScalarCoordinate(1).to_index("item") - - def test_isinstance(self): - assert 
ScalarCoordinate(1).isscalar() - assert not ScalarCoordinate(1).isdense() - assert not ScalarCoordinate(1).isinterp() - - def test_to_from_dict(self): - for data in self.valid: - coord = ScalarCoordinate(data) - assert ScalarCoordinate.from_dict(coord.to_dict()).equals(coord) - - def test_empty(self): - with pytest.raises(TypeError, match="cannot be empty"): - ScalarCoordinate() +from xdas.coordinates import DenseCoordinate, InterpCoordinate, ScalarCoordinate class TestDenseCoordinate: diff --git a/tests/test_sampled_coordinate.py b/tests/coordinates/test_sampled.py similarity index 99% rename from tests/test_sampled_coordinate.py rename to tests/coordinates/test_sampled.py index 643485dc..14ea457d 100644 --- a/tests/test_sampled_coordinate.py +++ b/tests/coordinates/test_sampled.py @@ -2,7 +2,12 @@ import pandas as pd import pytest -from xdas.core.coordinates import DenseCoordinate, SampledCoordinate, ScalarCoordinate +from xdas.coordinates import ( + Coordinate, + DenseCoordinate, + SampledCoordinate, + ScalarCoordinate, +) class TestSampledCoordinateBasics: @@ -355,8 +360,6 @@ def test_to_from_dict(self): ) d = coord.to_dict() # round-trip via Coordinate factory - from xdas.core.coordinates import Coordinate - back = Coordinate.from_dict(d) assert isinstance(back, SampledCoordinate) assert back.equals(coord) diff --git a/tests/coordinates/test_scalar.py b/tests/coordinates/test_scalar.py new file mode 100644 index 00000000..9060f60d --- /dev/null +++ b/tests/coordinates/test_scalar.py @@ -0,0 +1,90 @@ +import numpy as np +import pytest + +from xdas.coordinates import ScalarCoordinate + + +class TestScalarCoordinate: + valid = [ + 1, + np.array(1), + 1.0, + np.array(1.0), + "label", + np.array("label"), + np.datetime64(1, "s"), + ] + invalid = [[1], np.array([1]), {"key": "value"}] + + def test_isvalid(self): + for data in self.valid: + assert ScalarCoordinate.isvalid(data) + for data in self.invalid: + assert not ScalarCoordinate.isvalid(data) + + def 
test_init(self): + coord = ScalarCoordinate(1) + assert coord.data == 1 + assert coord.dim is None + coord = ScalarCoordinate(1, None) + assert coord.dim is None + with pytest.raises(ValueError): + ScalarCoordinate(1, "dim") + for data in self.valid: + assert ScalarCoordinate(data).data == np.array(data) + for data in self.invalid: + with pytest.raises(TypeError): + ScalarCoordinate(data) + + def test_getitem(self): + assert ScalarCoordinate(1)[...].equals(ScalarCoordinate(1)) + with pytest.raises(IndexError): + ScalarCoordinate(1)[:] + with pytest.raises(IndexError): + ScalarCoordinate(1)[0] + + def test_len(self): + with pytest.raises(TypeError): + len(ScalarCoordinate(1)) + + def test_repr(self): + for data in self.valid: + assert ScalarCoordinate(data).__repr__() == np.array2string( + np.asarray(data), threshold=0, edgeitems=1 + ) + + def test_array(self): + for data in self.valid: + assert ScalarCoordinate(data).__array__() == np.array(data) + + def test_dtype(self): + for data in self.valid: + assert ScalarCoordinate(data).dtype == np.array(data).dtype + + def test_values(self): + for data in self.valid: + assert ScalarCoordinate(data).values == np.array(data) + + def test_equals(self): + for data in self.valid: + coord = ScalarCoordinate(data) + assert coord.equals(coord) + assert ScalarCoordinate(1).equals(ScalarCoordinate(np.array(1))) + + def test_to_index(self): + with pytest.raises(NotImplementedError): + ScalarCoordinate(1).to_index("item") + + def test_isinstance(self): + assert ScalarCoordinate(1).isscalar() + assert not ScalarCoordinate(1).isdense() + assert not ScalarCoordinate(1).isinterp() + + def test_to_from_dict(self): + for data in self.valid: + coord = ScalarCoordinate(data) + assert ScalarCoordinate.from_dict(coord.to_dict()).equals(coord) + + def test_empty(self): + with pytest.raises(TypeError, match="cannot be empty"): + ScalarCoordinate() diff --git a/tests/test_core.py b/tests/test_core.py index aeb5c2de..51c33b2f 100644 --- 
a/tests/test_core.py +++ b/tests/test_core.py @@ -4,7 +4,8 @@ import numpy as np import pytest -import xdas +import xdas as xd +from xdas.coordinates import InterpCoordinate from xdas.synthetics import wavelet_wavefronts from xdas.virtual import VirtualStack @@ -20,7 +21,7 @@ def generate(self, datetime): else: t = {"tie_indices": [0, shape[0] - 1], "tie_values": [0, 3.0 - 1 / 100]} s = {"tie_indices": [0, shape[1] - 1], "tie_values": [0, 990.0]} - return xdas.DataArray( + return xd.DataArray( data=np.random.randn(*shape), coords={ "time": t, @@ -39,7 +40,7 @@ def test_open_mfdatatree(self): for idx, da in enumerate(wavelet_wavefronts(nchunk=3), start=1): da.to_netcdf(os.path.join(dirname, f"{idx:03d}.nc")) da = wavelet_wavefronts() - dc = xdas.open_mfdatatree( + dc = xd.open_mfdatatree( os.path.join(dirpath, "{node}", "00[acquisition].nc") ) assert list(dc.keys()) == keys @@ -51,10 +52,10 @@ def test_open_mfdataarray(self): wavelet_wavefronts().to_netcdf(os.path.join(dirpath, "sample.nc")) for idx, da in enumerate(wavelet_wavefronts(nchunk=3), start=1): da.to_netcdf(os.path.join(dirpath, f"{idx:03}.nc")) - da_monolithic = xdas.open_dataarray(os.path.join(dirpath, "sample.nc")) - da_chunked = xdas.open_mfdataarray(os.path.join(dirpath, "00*.nc")) + da_monolithic = xd.open_dataarray(os.path.join(dirpath, "sample.nc")) + da_chunked = xd.open_mfdataarray(os.path.join(dirpath, "00*.nc")) assert da_monolithic.equals(da_chunked) - da_chunked = xdas.open_mfdataarray( + da_chunked = xd.open_mfdataarray( [ os.path.join(dirpath, fname) for fname in ["001.nc", "002.nc", "003.nc"] @@ -62,9 +63,9 @@ def test_open_mfdataarray(self): ) assert da_monolithic.equals(da_chunked) with pytest.raises(FileNotFoundError): - xdas.open_mfdataarray("not_existing_files_*.nc") + xd.open_mfdataarray("not_existing_files_*.nc") with pytest.raises(FileNotFoundError): - xdas.open_mfdataarray(["not_existing_file.nc"]) + xd.open_mfdataarray(["not_existing_file.nc"]) def 
test_open_mfdataarray_grouping(self): with TemporaryDirectory() as dirpath: @@ -90,7 +91,7 @@ def test_open_mfdataarray_grouping(self): for da in wavelet_wavefronts(**acq): da.to_netcdf(os.path.join(dirpath, f"{count:03d}.nc")) count += 1 - dc = xdas.open_mfdataarray(os.path.join(dirpath, "*.nc")) + dc = xd.open_mfdataarray(os.path.join(dirpath, "*.nc")) assert len(dc) == 3 for da, acq in zip(dc, acqs): acq |= {"nchunk": None} @@ -108,28 +109,28 @@ def test_concatenate(self): }, "distance": da1["distance"], } - expected = xdas.DataArray(data, coords) - result = xdas.concatenate([da1, da2]) + expected = xd.DataArray(data, coords) + result = xd.concatenate([da1, da2]) assert result.equals(expected) # concatenate an empty data array - result = xdas.concatenate([da1, da2.isel(time=slice(0, 0))]) + result = xd.concatenate([da1, da2.isel(time=slice(0, 0))]) assert result.equals(da1) # concat of sources and stacks with TemporaryDirectory() as tmp_path: da1.to_netcdf(os.path.join(tmp_path, "da1.nc")) da2.to_netcdf(os.path.join(tmp_path, "da2.nc")) - da1 = xdas.open_dataarray(os.path.join(tmp_path, "da1.nc")) - da2 = xdas.open_dataarray(os.path.join(tmp_path, "da2.nc")) - result = xdas.concatenate([da1, da2]) + da1 = xd.open_dataarray(os.path.join(tmp_path, "da1.nc")) + da2 = xd.open_dataarray(os.path.join(tmp_path, "da2.nc")) + result = xd.concatenate([da1, da2]) assert isinstance(result.data, VirtualStack) assert result.equals(expected) da1.data = VirtualStack([da1.data]) da2.data = VirtualStack([da2.data]) - result = xdas.concatenate([da1, da2]) + result = xd.concatenate([da1, da2]) assert isinstance(result.data, VirtualStack) assert result.equals(expected) # concat of 3D data arrays with unsorted coords: - da1 = xdas.DataArray( + da1 = xd.DataArray( data=np.zeros((5, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -138,7 +139,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - da2 = xdas.DataArray( + da2 = xd.DataArray( data=np.ones((7, 4, 3)), 
coords={ "phase": ["A", "B", "C"], @@ -147,7 +148,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - expected = xdas.DataArray( + expected = xd.DataArray( data=np.concatenate((np.zeros((5, 4, 3)), np.ones((7, 4, 3))), axis=0), coords={ "time": {"tie_indices": [0, 11], "tie_values": [0, 11]}, @@ -155,9 +156,9 @@ def test_concatenate(self): "phase": ["A", "B", "C"], }, ) - assert xdas.concatenate((da1, da2), dim="time").equals(expected) + assert xd.concatenate((da1, da2), dim="time").equals(expected) # concat dense coordinates - da1 = xdas.DataArray( + da1 = xd.DataArray( data=np.zeros((5, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -166,7 +167,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - da2 = xdas.DataArray( + da2 = xd.DataArray( data=np.ones((7, 4, 3)), coords={ "phase": ["A", "B", "C"], @@ -175,7 +176,7 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - expected = xdas.DataArray( + expected = xd.DataArray( data=np.concatenate((np.zeros((5, 4, 3)), np.ones((7, 4, 3))), axis=0), coords={ "phase": ["A", "B", "C"], @@ -184,34 +185,34 @@ def test_concatenate(self): }, dims=("time", "distance", "phase"), ) - assert xdas.concatenate((da1, da2), dim="time").equals(expected) + assert xd.concatenate((da1, da2), dim="time").equals(expected) # stack da = wavelet_wavefronts() objs = [obj for obj in da] - result = xdas.concatenate(objs, dim="time") - result["time"] = xdas.InterpCoordinate.from_array(result["time"].values) + result = xd.concatenate(objs, dim="time") + result["time"] = InterpCoordinate.from_array(result["time"].values) assert result.equals(da) objs = [obj.drop_coords("time") for obj in da] - result = xdas.concatenate(objs, dim="time") + result = xd.concatenate(objs, dim="time") assert result.equals(da.drop_coords("time")) def test_open_dataarray(self): with pytest.raises(FileNotFoundError): - xdas.open_dataarray("not_existing_file.nc") + xd.open_dataarray("not_existing_file.nc") def 
test_open_datacollection(self): with pytest.raises(FileNotFoundError): - xdas.open_datacollection("not_existing_file.nc") + xd.open_datacollection("not_existing_file.nc") def test_asdataarray(self): da = self.generate(False) - out = xdas.asdataarray(da.to_xarray()) + out = xd.asdataarray(da.to_xarray()) assert np.array_equal(out.data, da.data) for dim in da.dims: assert np.array_equal(out[dim].values, da[dim].values) def test_split(self): - da = xdas.DataArray( + da = xd.DataArray( np.ones(30), { "time": { @@ -220,22 +221,22 @@ def test_split(self): }, }, ) - assert xdas.concatenate(xdas.split(da)).equals(da) - assert xdas.split(da, tolerance=20.0)[0].equals(da) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) def test_chunk(self): da = wavelet_wavefronts() - assert xdas.concatenate(xdas.split(da, 3)).equals(da) + assert xd.concatenate(xd.split(da, 3)).equals(da) def test_align(self): - da1 = xdas.DataArray(np.arange(2), {"x": [0, 1]}) - da2 = xdas.DataArray(np.arange(3), {"y": [2, 3, 4]}) - da1, da2 = xdas.align(da1, da2) + da1 = xd.DataArray(np.arange(2), {"x": [0, 1]}) + da2 = xd.DataArray(np.arange(3), {"y": [2, 3, 4]}) + da1, da2 = xd.align(da1, da2) assert da1.sizes == {"x": 2, "y": 1} assert da2.sizes == {"x": 1, "y": 3} - da3 = xdas.DataArray(np.arange(4).reshape(2, 2), {"x": [0, 1], "y": [2, 3]}) + da3 = xd.DataArray(np.arange(4).reshape(2, 2), {"x": [0, 1], "y": [2, 3]}) with pytest.raises(ValueError, match="incompatible sizes"): - xdas.align(da1, da2, da3) - da3 = xdas.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) + xd.align(da1, da2, da3) + da3 = xd.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) with pytest.raises(ValueError, match="differs from one data array to another"): - xdas.align(da1, da2, da3) + xd.align(da1, da2, da3) diff --git a/tests/test_dataarray.py b/tests/test_dataarray.py index a0a9c0bf..4a251f7e 100644 --- a/tests/test_dataarray.py +++ 
b/tests/test_dataarray.py @@ -7,9 +7,8 @@ import numpy as np import pytest -import xdas -from xdas.core.coordinates import Coordinates, DenseCoordinate, InterpCoordinate -from xdas.core.dataarray import DataArray +import xdas as xd +from xdas.coordinates import Coordinates, DenseCoordinate, InterpCoordinate from xdas.synthetics import wavelet_wavefronts @@ -20,12 +19,12 @@ def generate(self, dense=False): else: coords = {"dim": {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}} data = 0.1 * np.arange(9) - da = xdas.DataArray(data, coords) + da = xd.DataArray(data, coords) return da def test_init_without_coords(self): data = np.arange(2 * 3 * 5).reshape(2, 3, 5) - da = xdas.DataArray(data) + da = xd.DataArray(data) assert np.array_equal(da.data, data) assert da.dims == ("dim_0", "dim_1", "dim_2") assert da.coords == {} @@ -53,30 +52,30 @@ def test_init_and_properties(self): assert da.dtype == np.float64 da = self.generate(dense=True) assert isinstance(da["dim"], DenseCoordinate) - da = DataArray() + da = xd.DataArray() assert np.array_equal(da.values, np.array(np.nan), equal_nan=True) assert da.coords == {} assert da.dims == tuple() - da = DataArray([[]]) + da = xd.DataArray([[]]) assert da.dims == ("dim_0", "dim_1") assert da.ndim == 2 - da = DataArray(1) + da = xd.DataArray(1) assert da.dims == tuple() assert da.ndim == 0 def test_raises_on_data_and_coords_mismatch(self): with pytest.raises(ValueError, match="different number of dimensions"): - DataArray(np.zeros(3), dims=("time", "distance")) + xd.DataArray(np.zeros(3), dims=("time", "distance")) with pytest.raises( ValueError, match="inferred number of dimensions 2 from `coords` does not match `data` dimensionality of 1", ): - DataArray(np.zeros(3), coords={"time": [1], "distance": [1]}) + xd.DataArray(np.zeros(3), coords={"time": [1], "distance": [1]}) with pytest.raises(ValueError, match="conflicting sizes for dimension"): - DataArray(np.zeros((2, 3)), coords={"time": [1, 2], "distance": [1, 2]}) + 
xd.DataArray(np.zeros((2, 3)), coords={"time": [1, 2], "distance": [1, 2]}) def test_coords_setter(self): - da = xdas.DataArray(np.arange(3 * 11).reshape(3, 11)) + da = xd.DataArray(np.arange(3 * 11).reshape(3, 11)) da["dim_0"] = [1, 2, 4] da["dim_1"] = {"tie_indices": [0, 10], "tie_values": [0.0, 100.0]} da["dim_0"] = [1, 2, 3] @@ -163,7 +162,7 @@ def test_sel(self): assert "distance" not in result.coords def test_better_error_when_sel_with_overlaps(self): - da = DataArray( + da = xd.DataArray( np.arange(80).reshape(20, 4), { "time": { @@ -195,7 +194,7 @@ def test_to_xarray(self): result = da.to_xarray() assert np.array_equal(result.values, da.values) assert np.array_equal(result["dim"].values, da["dim"].values) - da = da.sel(dim=slice(1000, 2000)) # empty dataarray + da = da.sel(dim=slice(1000, 2000)) # empty xd.dataarray result = da.to_xarray() assert np.array_equal(result.values, da.values) assert np.array_equal(result["dim"].values, da["dim"].values) @@ -203,7 +202,7 @@ def test_to_xarray(self): def test_from_xarray(self): da = self.generate() da = da.to_xarray() - result = DataArray.from_xarray(da) + result = xd.DataArray.from_xarray(da) assert np.array_equal(result.values, da.values) assert np.array_equal(result["dim"].values, da["dim"].values) @@ -219,7 +218,7 @@ def test_stream(self): assert st[0].stats.npts == da.sizes["time"] assert np.datetime64(st[0].stats.starttime.datetime) == da["time"][0].values assert np.datetime64(st[0].stats.endtime.datetime) == da["time"][-1].values - result = DataArray.from_stream(st) + result = xd.DataArray.from_stream(st) assert np.array_equal(result.values.T, da.values) assert result.sizes == { "channel": da.sizes["distance"], @@ -231,10 +230,10 @@ def test_dense_str(self): coord = [f"D{k}" for k in range(9)] coords = Coordinates({"dim": coord}) data = 0.1 * np.arange(9) - DataArray(data, coords) + xd.DataArray(data, coords) def test_single_index_selection(self): - da = DataArray( + da = xd.DataArray( 
np.arange(12).reshape(3, 4), { "time": {"tie_values": [0.0, 1.0], "tie_indices": [0, 2]}, @@ -244,7 +243,7 @@ def test_single_index_selection(self): da_getitem = da[1] da_isel = da.isel(time=1) da_sel = da.sel(time=0.5) - da_expected = DataArray( + da_expected = xd.DataArray( np.array([4, 5, 6, 7]), {"time": (None, 0.5), "distance": [0.0, 10.0, 20.0, 30.0]}, ) @@ -254,7 +253,7 @@ def test_single_index_selection(self): da_getitem = da[:, 1] da_isel = da.isel(distance=1) da_sel = da.sel(distance=10.0) - da_expected = DataArray( + da_expected = xd.DataArray( np.array([1, 5, 9]), { "time": {"tie_values": [0.0, 1.0], "tie_indices": [0, 2]}, @@ -266,7 +265,7 @@ def test_single_index_selection(self): assert da_sel.equals(da_expected) def test_assign_coords(self): - da = DataArray( + da = xd.DataArray( data=np.zeros(3), coords={"time": np.array([3, 4, 5])}, ) @@ -277,7 +276,7 @@ def test_assign_coords(self): assert np.array_equal(result["relative_time"].values, [0, 1, 2]) def test_swap_dims(self): - da = DataArray( + da = xd.DataArray( data=[0, 1], coords={"x": ["a", "b"], "y": ("x", [0, 1])}, ) @@ -294,7 +293,7 @@ def test_swap_dims(self): da.swap_dims({"z": "x"}) def test_to_xarray_non_dimensional(self): - da = DataArray( + da = xd.DataArray( data=np.zeros(3), coords={ "time": np.array([3, 4, 5]), @@ -308,7 +307,7 @@ def test_to_xarray_non_dimensional(self): assert result.dims == da.dims def test_netcdf_non_dimensional(self): - da = DataArray( + da = xd.DataArray( data=np.zeros(3), coords={ "time": np.array([3, 4, 5]), @@ -318,16 +317,16 @@ def test_netcdf_non_dimensional(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - result = xdas.open_dataarray(path) + result = xd.open_dataarray(path) assert result.equals(da) with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "da.nc") da = wavelet_wavefronts().assign_coords(lon=("distance", np.arange(401))) da.to_netcdf(path) - tmp = xdas.open_dataarray(path) + 
tmp = xd.open_dataarray(path) path = path = os.path.join(dirpath, "vds.nc") tmp.to_netcdf(path) - result = xdas.open_dataarray(path) + result = xd.open_dataarray(path) assert result.equals(da) def test_transpose(self): @@ -345,16 +344,16 @@ def test_transpose(self): da.transpose("space", "frequency") def test_expand_dims(self): - da = DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2]}) + da = xd.DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2]}) result = da.expand_dims("y", 0) assert result.dims == ("y", "x") assert result.shape == (1, 3) - da = DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2], "y": 0}, dims=("x",)) + da = xd.DataArray([1.0, 2.0, 3.0], {"x": [0, 1, 2], "y": 0}, dims=("x",)) result = da.expand_dims("y") assert result.dims == ("y", "x") assert result.shape == (1, 3) - assert result["y"].equals(xdas.Coordinate([0], dim="y")) + assert result["y"].equals(xd.Coordinate([0], dim="y")) def test_io(self): # both coords interpolated @@ -362,7 +361,7 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - da_recovered = DataArray.from_netcdf(path) + da_recovered = xd.DataArray.from_netcdf(path) assert da.equals(da_recovered) # mixed interpolated and dense @@ -370,7 +369,7 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - da_recovered = DataArray.from_netcdf(path) + da_recovered = xd.DataArray.from_netcdf(path) assert da.equals(da_recovered) # only dense coords @@ -378,11 +377,11 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - da_recovered = DataArray.from_netcdf(path) + da_recovered = xd.DataArray.from_netcdf(path) assert da.equals(da_recovered) def test_io_with_zfp_compression(self): - da = DataArray(np.random.rand(101, 101)) + da = xd.DataArray(np.random.rand(101, 101)) with TemporaryDirectory() as tmpdir: tmpfile_uncompressed = os.path.join(tmpdir, "uncompressed.nc") 
da.to_netcdf(tmpfile_uncompressed) @@ -398,7 +397,7 @@ def test_io_with_zfp_compression(self): chunk_compressed_size = os.path.getsize(tmpfile_chunk_compressed) assert chunk_compressed_size < uncompressed_size assert compressed_size < chunk_compressed_size - _da = DataArray.from_netcdf(tmpfile_compressed) + _da = xd.DataArray.from_netcdf(tmpfile_compressed) assert np.abs(da - _da).max().values < 0.001 def test_io_dask(self): @@ -414,7 +413,7 @@ def test_io_dask(self): for chunk in chunks ] data = dask.array.concatenate(chunks, axis=1) - expected = DataArray( + expected = xd.DataArray( data, coords={"time": np.arange(3), "distance": np.arange(10)}, attrs={"version": "1.0"}, @@ -422,7 +421,7 @@ def test_io_dask(self): ) fname = os.path.join(tmpdir, "tmp.nc") expected.to_netcdf(fname) - result = xdas.open_dataarray(fname) + result = xd.open_dataarray(fname) assert isinstance(result.data, dask.array.Array) assert np.array_equal(expected.values, result.values) assert expected.dtype == result.dtype @@ -432,16 +431,16 @@ def test_io_dask(self): assert expected.attrs == result.attrs def test_io_non_dimensional(self): - expected = DataArray(coords={"dim": 0}, dims=()) + expected = xd.DataArray(coords={"dim": 0}, dims=()) with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") expected.to_netcdf(path) - result = DataArray.from_netcdf(path) + result = xd.DataArray.from_netcdf(path) assert expected.equals(result) def test_io_attrs(self): attrs = {"description": "test"} - da = DataArray( + da = xd.DataArray( np.arange(3), coords={"time": np.array([3, 4, 5])}, attrs=attrs, @@ -449,13 +448,13 @@ def test_io_attrs(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") da.to_netcdf(path) - result = DataArray.from_netcdf(path) + result = xd.DataArray.from_netcdf(path) assert result.attrs == attrs assert result.equals(da) - da = xdas.open_dataarray(path) + da = xd.open_dataarray(path) path = os.path.join(dirpath, "vds.nc") 
da.to_netcdf(path) - result = xdas.open_dataarray(path) + result = xd.open_dataarray(path) assert result.attrs == attrs assert result.equals(da) diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py index 1f61b64e..7ebd2c0f 100644 --- a/tests/test_datacollection.py +++ b/tests/test_datacollection.py @@ -4,7 +4,7 @@ import h5py import pytest -import xdas +import xdas as xd import xdas.signal as xs from xdas.core.datacollection import get_depth from xdas.synthetics import wavelet_wavefronts @@ -12,10 +12,10 @@ class TestDataCollection: def nest(self, da): - return xdas.DataCollection( + return xd.DataCollection( { - "das1": xdas.DataCollection([da, da], "acquisition"), - "das2": xdas.DataCollection([da, da, da], "acquisition"), + "das1": xd.DataCollection([da, da], "acquisition"), + "das2": xd.DataCollection([da, da, da], "acquisition"), }, "instrument", ) @@ -30,12 +30,12 @@ def test_init(self): "das2": ("acquisition", [da, da, da]), }, ) - result = xdas.DataCollection(data) + result = xd.DataCollection(data) assert result.equals(dc) def test_io(self): da = wavelet_wavefronts() - dc = xdas.DataCollection( + dc = xd.DataCollection( { "das1": da, "das2": da, @@ -45,27 +45,27 @@ def test_io(self): with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") dc.to_netcdf(path) - result = xdas.DataCollection.from_netcdf(path) + result = xd.DataCollection.from_netcdf(path) assert result.equals(dc) - dc = xdas.DataCollection([da, da], "instrument") + dc = xd.DataCollection([da, da], "instrument") with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") dc.to_netcdf(path) - result = xdas.DataCollection.from_netcdf(path) + result = xd.DataCollection.from_netcdf(path) assert result.equals(dc) - dc = xdas.DataCollection( + dc = xd.DataCollection( { - "das1": xdas.DataCollection([da, da], "acquisition"), - "das2": xdas.DataCollection([da, da, da], "acquisition"), + "das1": xd.DataCollection([da, da], "acquisition"), + 
"das2": xd.DataCollection([da, da, da], "acquisition"), }, "instrument", ) with TemporaryDirectory() as dirpath: path = os.path.join(dirpath, "tmp.nc") dc.to_netcdf(path) - result = xdas.DataCollection.from_netcdf(path) + result = xd.DataCollection.from_netcdf(path) assert result.equals(dc) - result = xdas.open_datacollection(path) + result = xd.open_datacollection(path) assert result.equals(dc) def test_depth_counter(self): @@ -108,9 +108,9 @@ def test_query(self): da = wavelet_wavefronts() dc = self.nest(da) result = dc.query(instrument="das1", acquisition=0) - expected = xdas.DataCollection( + expected = xd.DataCollection( { - "das1": xdas.DataCollection([da], "acquisition"), + "das1": xd.DataCollection([da], "acquisition"), }, "instrument", ) diff --git a/tests/test_processing.py b/tests/test_processing.py index 3acfb62a..5f72976b 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -10,10 +10,10 @@ import pandas as pd import scipy.signal as sp -import xdas +import xdas as xd import xdas.processing as xp from xdas.atoms import Partial, Sequential -from xdas.processing.core import ( +from xdas.processing import ( DataArrayLoader, DataArrayWriter, DataFrameWriter, @@ -31,7 +31,7 @@ def test_stateful(self): with tempfile.TemporaryDirectory() as tempdir: # generate test dataarray wavelet_wavefronts().to_netcdf(os.path.join(tempdir, "sample.nc")) - da = xdas.open_dataarray(os.path.join(tempdir, "sample.nc")) + da = xd.open_dataarray(os.path.join(tempdir, "sample.nc")) # declare processing sequence sos = sp.iirfilter(4, 0.1, btype="lowpass", output="sos") @@ -189,20 +189,20 @@ def publish(): result.append(packet) if n == len(packets): break - return xdas.concatenate(result) + return xd.concatenate(result) def test_publish_and_subscribe(self): - expected = xdas.synthetics.dummy() - packets = xdas.split(expected, 10) - address = f"tcp://localhost:{xdas.io.get_free_port()}" + expected = xd.synthetics.dummy() + packets = xd.split(expected, 10) + 
address = f"tcp://localhost:{xd.io.get_free_port()}" result = self._publish_and_subscribe(packets, address) assert result.equals(expected) def test_encoding(self): - expected = xdas.synthetics.dummy() - packets = xdas.split(expected, 10) - address = f"tcp://localhost:{xdas.io.get_free_port()}" + expected = xd.synthetics.dummy() + packets = xd.split(expected, 10) + address = f"tcp://localhost:{xd.io.get_free_port()}" encoding = {"chunks": (10, 10), **hdf5plugin.Zfp(accuracy=1e-6)} result = self._publish_and_subscribe(packets, address, encoding=encoding) @@ -221,7 +221,7 @@ def test_without_gap(self): endtime = starttime + np.timedelta64(10, "ms") * (data.shape[0] - 1) distance = 5.0 * np.arange(data.shape[1]) - da = xdas.DataArray( + da = xd.DataArray( data=data, coords={ "time": { @@ -275,7 +275,7 @@ def test_without_gap(self): def test_with_gap(self): with tempfile.TemporaryDirectory() as tempdir: - da = xdas.DataArray( + da = xd.DataArray( data=np.random.randint( low=-1000, high=1000, size=(900, 10), dtype=np.int32 ), @@ -350,7 +350,7 @@ def test_flat(self): endtime = starttime + np.timedelta64(10, "ms") * (data.shape[0] - 1) distance = 5.0 * np.arange(data.shape[1]) - da = xdas.DataArray( + da = xd.DataArray( data=data, coords={ "time": { diff --git a/tests/test_routines.py b/tests/test_routines.py index 32d1d4da..d74e00ac 100644 --- a/tests/test_routines.py +++ b/tests/test_routines.py @@ -2,9 +2,7 @@ import pytest import xdas as xd -from xdas.core.coordinates import Coordinates -from xdas.core.dataarray import DataArray -from xdas.core.routines import Bag, CompatibilityError, combine_by_coords +from xdas.core.routines import Bag, CompatibilityError class TestBag: @@ -14,30 +12,30 @@ def test_bag_initialization(self): assert bag.objs == [] def test_bag_append_initializes(self): - da = DataArray( + da = xd.DataArray( np.random.rand(10, 5), {"time": np.arange(10), "space": np.arange(5)} ) bag = Bag(dim="time") bag.append(da) assert len(bag.objs) == 1 assert 
bag.objs[0] is da - assert bag.subcoords.equals(Coordinates({"space": np.arange(5)})) + assert bag.subcoords.equals(xd.Coordinates({"space": np.arange(5)})) assert bag.subshape == (5,) assert bag.dims == ("time", "space") assert bag.delta def test_bag_append_compatible(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("time", "space")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) bag = Bag(dim="time") bag.append(da1) bag.append(da2) assert len(bag.objs) == 2 assert bag.objs[1] is da2 - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), {"time": np.arange(10), "space": np.arange(5)} ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), {"time": np.arange(10, 20), "space": np.arange(5)} ) bag = Bag(dim="time") @@ -47,36 +45,38 @@ def test_bag_append_compatible(self): assert bag.objs[1] is da2 def test_bag_append_incompatible_dims(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("space", "time")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("space", "time")) bag = Bag(dim="time") bag.append(da1) with pytest.raises(CompatibilityError): bag.append(da2) def test_bag_append_incompatible_shape(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 6), dims=("time", "space")) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 6), dims=("time", "space")) bag = Bag(dim="time") bag.append(da1) with pytest.raises(CompatibilityError): bag.append(da2) def test_bag_append_incompatible_dtype(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.randint(0, 10, size=(10, 5)), dims=("time", "space")) + da1 = 
xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray( + np.random.randint(0, 10, size=(10, 5)), dims=("time", "space") + ) bag = Bag(dim="time") bag.append(da1) with pytest.raises(CompatibilityError): bag.append(da2) def test_bag_append_incompatible_coords(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5) + 1}, @@ -87,12 +87,12 @@ def test_bag_append_incompatible_coords(self): bag.append(da2) def test_bag_append_incompatible_sampling_interval(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10) * 2}, @@ -106,91 +106,93 @@ def test_bag_append_incompatible_sampling_interval(self): class TestCombineByCoords: def test_basic(self): # without coords - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - combined = combine_by_coords([da1, da2], dim="time", squeeze=True) + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + combined = xd.combine_by_coords([da1, da2], dim="time", squeeze=True) assert combined.shape == (20, 5) # with coords - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), coords={"time": np.arange(10), "space": np.arange(5)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), coords={"time": np.arange(10, 20), "space": np.arange(5)}, ) - combined = combine_by_coords([da1, da2], dim="time", squeeze=True) + combined = xd.combine_by_coords([da1, da2], dim="time", squeeze=True) assert combined.shape == (20, 5) def test_incompatible_shape(self): - da1 = 
DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 6), dims=("time", "space")) - dc = combine_by_coords([da1, da2], dim="time") + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 6), dims=("time", "space")) + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_dims(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.rand(10, 5), dims=("space", "time")) - dc = combine_by_coords([da1, da2], dim="time") + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray(np.random.rand(10, 5), dims=("space", "time")) + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_dtype(self): - da1 = DataArray(np.random.rand(10, 5), dims=("time", "space")) - da2 = DataArray(np.random.randint(0, 10, size=(10, 5)), dims=("time", "space")) - dc = combine_by_coords([da1, da2], dim="time") + da1 = xd.DataArray(np.random.rand(10, 5), dims=("time", "space")) + da2 = xd.DataArray( + np.random.randint(0, 10, size=(10, 5)), dims=("time", "space") + ) + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_coords(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"space": np.arange(5) + 1}, ) - dc = combine_by_coords([da1, da2], dim="time") + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_incompatible_sampling_interval(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10, 5), dims=("time", 
"space"), coords={"time": np.arange(10)}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10, 5), dims=("time", "space"), coords={"time": np.arange(10) * 2}, ) - dc = combine_by_coords([da1, da2], dim="time") + dc = xd.combine_by_coords([da1, da2], dim="time") assert len(dc) == 2 assert dc[0].equals(da1) assert dc[1].equals(da2) def test_expand_scalar_coordinate(self): - da1 = DataArray( + da1 = xd.DataArray( np.random.rand(10), dims=("time",), coords={"time": np.arange(10), "space": 0}, ) - da2 = DataArray( + da2 = xd.DataArray( np.random.rand(10), dims=("time",), coords={"time": np.arange(10), "space": 1}, ) - dc = combine_by_coords([da1, da2], dim="space", squeeze=True) + dc = xd.combine_by_coords([da1, da2], dim="space", squeeze=True) assert dc.shape == (2, 10) assert dc.dims == ("space", "time") assert dc.coords["space"].values.tolist() == [0, 1] @@ -198,7 +200,7 @@ def test_expand_scalar_coordinate(self): class TestOpenMFDataArray: def test_warn_on_corrupted_files(self, tmp_path): - expected = DataArray( + expected = xd.DataArray( np.random.rand(10, 5), coords={ "time": np.arange(10), diff --git a/tests/test_virtual.py b/tests/test_virtual.py index 4e3e2a05..94b80b3d 100644 --- a/tests/test_virtual.py +++ b/tests/test_virtual.py @@ -5,7 +5,7 @@ import numpy as np import pytest -import xdas +import xdas as xd from xdas.synthetics import wavelet_wavefronts from xdas.virtual import ( Selection, @@ -22,11 +22,11 @@ class TestFunctional: # TODO: move elsewhere def test_all(self): with tempfile.TemporaryDirectory() as dirpath: expected = wavelet_wavefronts() - chunks = xdas.split(expected, 3) + chunks = xd.split(expected, 3) for index, chunk in enumerate(chunks, start=1): chunk.to_netcdf(os.path.join(dirpath, f"{index:03d}.nc")) - da = xdas.open_dataarray(os.path.join(dirpath, "002.nc")) + da = xd.open_dataarray(os.path.join(dirpath, "002.nc")) datasource = da.data assert np.allclose(np.asarray(datasource[0]), da.load().values[0]) assert 
np.allclose(np.asarray(datasource[0][1]), da.load().values[0][1]) @@ -68,9 +68,9 @@ def test_dtypes(self, tmp_path): np.complex128, ) for dtype in dtypes: - expected = xdas.DataArray(np.zeros((3, 5), dtype=dtype)) + expected = xd.DataArray(np.zeros((3, 5), dtype=dtype)) expected.to_netcdf(tmp_path / "data.nc") - result = xdas.open_dataarray(tmp_path / "data.nc") + result = xd.open_dataarray(tmp_path / "data.nc") assert result.equals(expected) diff --git a/tests/test_xarray.py b/tests/test_xarray.py index 382a4fe6..9e87a62a 100644 --- a/tests/test_xarray.py +++ b/tests/test_xarray.py @@ -1,7 +1,7 @@ import numpy as np +import xdas as xd import xdas.core.methods as xm -from xdas.core.dataarray import DataArray from xdas.synthetics import wavelet_wavefronts @@ -15,19 +15,19 @@ def test_returns_dataarray(self): "quantile", ]: result = func(da, 0.5) - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) result = getattr(da, name)(0.5) - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) elif name == "diff": result = func(da, "time") - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) result = getattr(da, name)("time") - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) else: result = func(da) - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) result = getattr(da, name)() - assert isinstance(result, DataArray) + assert isinstance(result, xd.DataArray) def test_mean(self): da = wavelet_wavefronts() diff --git a/xdas/__init__.py b/xdas/__init__.py index e9a85cb5..44220eb0 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -1,11 +1,6 @@ -from . 
import atoms, config, fft, io, parallel, processing, signal, synthetics, virtual -from .core import coordinates, dataarray, datacollection, methods, numpy, routines -from .core.coordinates import ( +from .coordinates import ( Coordinate, Coordinates, - DenseCoordinate, - InterpCoordinate, - ScalarCoordinate, get_sampling_interval, ) from .core.dataarray import DataArray diff --git a/xdas/atoms/ml.py b/xdas/atoms/ml.py index f3fc4faf..5add9e22 100644 --- a/xdas/atoms/ml.py +++ b/xdas/atoms/ml.py @@ -2,9 +2,9 @@ import numpy as np -from ..atoms import Atom, State from ..core.dataarray import DataArray from ..core.routines import concatenate +from .core import Atom, State class LazyModule: diff --git a/xdas/atoms/signal.py b/xdas/atoms/signal.py index 7378a7d5..21f4027b 100644 --- a/xdas/atoms/signal.py +++ b/xdas/atoms/signal.py @@ -3,7 +3,7 @@ import numpy as np import scipy.signal as sp -from ..core.coordinates import Coordinate, get_sampling_interval +from ..coordinates.core import Coordinate, get_sampling_interval from ..core.dataarray import DataArray from ..core.routines import concatenate, split from ..parallel import parallelize diff --git a/xdas/core/coordinates/__init__.py b/xdas/coordinates/__init__.py similarity index 100% rename from xdas/core/coordinates/__init__.py rename to xdas/coordinates/__init__.py diff --git a/xdas/core/coordinates/core.py b/xdas/coordinates/core.py similarity index 99% rename from xdas/core/coordinates/core.py rename to xdas/coordinates/core.py index f70c36cd..d9e6813e 100644 --- a/xdas/core/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -421,7 +421,7 @@ def append(self, other): raise NotImplementedError(f"append is not implemented for {self.__class__}") def to_dataarray(self): - from ..dataarray import DataArray # TODO: avoid defered import? + from ..core.dataarray import DataArray # TODO: avoid defered import? 
if self.name is None: raise ValueError("cannot convert unnamed coordinate to DataArray") diff --git a/xdas/core/coordinates/default.py b/xdas/coordinates/default.py similarity index 100% rename from xdas/core/coordinates/default.py rename to xdas/coordinates/default.py diff --git a/xdas/core/coordinates/dense.py b/xdas/coordinates/dense.py similarity index 100% rename from xdas/core/coordinates/dense.py rename to xdas/coordinates/dense.py diff --git a/xdas/core/coordinates/interp.py b/xdas/coordinates/interp.py similarity index 100% rename from xdas/core/coordinates/interp.py rename to xdas/coordinates/interp.py diff --git a/xdas/core/coordinates/sampled.py b/xdas/coordinates/sampled.py similarity index 100% rename from xdas/core/coordinates/sampled.py rename to xdas/coordinates/sampled.py diff --git a/xdas/core/coordinates/scalar.py b/xdas/coordinates/scalar.py similarity index 100% rename from xdas/core/coordinates/scalar.py rename to xdas/coordinates/scalar.py diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index e3ba7a9a..468aa4d6 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -1,6 +1,4 @@ import copy -import json -import re import warnings from functools import partial @@ -12,9 +10,9 @@ from dask.array import Array as DaskArray from numpy.lib.mixins import NDArrayOperatorsMixin +from ..coordinates import Coordinates, get_sampling_interval from ..dask.core import create_variable, from_dict, loads, to_dict from ..virtual import VirtualArray, VirtualSource, _to_human -from .coordinates import Coordinate, Coordinates, get_sampling_interval HANDLED_NUMPY_FUNCTIONS = {} HANDLED_METHODS = {} diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 187c88d8..f34c4b8c 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -11,8 +11,9 @@ import xarray as xr from tqdm import tqdm +from ..coordinates.core import Coordinates, get_sampling_interval +from ..coordinates.interp import InterpCoordinate from ..virtual import 
VirtualSource, VirtualStack -from .coordinates import Coordinates, InterpCoordinate, get_sampling_interval from .dataarray import DataArray from .datacollection import DataCollection, DataMapping, DataSequence diff --git a/xdas/fft.py b/xdas/fft.py index 5d6ca31d..bb47bb89 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -1,7 +1,7 @@ import numpy as np from .atoms.core import atomized -from .core.coordinates import get_sampling_interval +from .coordinates.core import get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize diff --git a/xdas/io/__init__.py b/xdas/io/__init__.py index 54f6f22e..926a1a90 100644 --- a/xdas/io/__init__.py +++ b/xdas/io/__init__.py @@ -1,2 +1 @@ -from . import apsensing, asn, febus, miniseed, optasense, silixa, sintela, terra15 from .core import get_free_port diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 7dc42344..44da7a5d 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -4,8 +4,7 @@ import numpy as np import zmq -from xdas.core.coordinates import get_sampling_interval - +from ..coordinates.core import get_sampling_interval from ..core.dataarray import DataArray from ..virtual import VirtualSource diff --git a/xdas/io/miniseed.py b/xdas/io/miniseed.py index 37e74610..6a2d4691 100644 --- a/xdas/io/miniseed.py +++ b/xdas/io/miniseed.py @@ -2,7 +2,7 @@ import numpy as np import obspy -from ..core.coordinates import Coordinate, Coordinates +from ..coordinates.core import Coordinate, Coordinates from ..core.dataarray import DataArray diff --git a/xdas/processing/__init__.py b/xdas/processing/__init__.py index b8ffae40..2cbe1d76 100644 --- a/xdas/processing/__init__.py +++ b/xdas/processing/__init__.py @@ -1,4 +1,3 @@ -from . 
import monitor from .core import ( DataArrayLoader, DataArrayWriter, diff --git a/xdas/signal.py b/xdas/signal.py index 5d155527..fefb4d53 100644 --- a/xdas/signal.py +++ b/xdas/signal.py @@ -2,7 +2,7 @@ import scipy.signal as sp from .atoms.core import atomized -from .core.coordinates import Coordinate, get_sampling_interval +from .coordinates.core import Coordinate, get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize from .spectral import stft diff --git a/xdas/spectral.py b/xdas/spectral.py index 44485ad3..c4a6e279 100644 --- a/xdas/spectral.py +++ b/xdas/spectral.py @@ -2,7 +2,7 @@ from scipy.fft import fft, fftfreq, fftshift, rfft, rfftfreq from scipy.signal import get_window -from .core.coordinates import get_sampling_interval +from .coordinates.core import get_sampling_interval from .core.dataarray import DataArray from .parallel import parallelize diff --git a/xdas/trigger.py b/xdas/trigger.py index b0a62eed..02142b6d 100644 --- a/xdas/trigger.py +++ b/xdas/trigger.py @@ -3,7 +3,7 @@ from numba import njit from .atoms.core import Atom, State, atomized -from .core.coordinates import Coordinate +from .coordinates.core import Coordinate class Trigger(Atom): From 1c51119b3d175b56bba52105f54e882404a33bde Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 16:19:06 +0100 Subject: [PATCH 042/104] Refactor: put dense and interp coord tests in separate files. 
--- tests/coordinates/test_coordinates.py | 512 ++------------------------ tests/coordinates/test_dense.py | 137 +++++++ tests/coordinates/test_interp.py | 318 ++++++++++++++++ 3 files changed, 487 insertions(+), 480 deletions(-) create mode 100644 tests/coordinates/test_dense.py create mode 100644 tests/coordinates/test_interp.py diff --git a/tests/coordinates/test_coordinates.py b/tests/coordinates/test_coordinates.py index 2a3166a1..d0825bd4 100644 --- a/tests/coordinates/test_coordinates.py +++ b/tests/coordinates/test_coordinates.py @@ -1,516 +1,68 @@ import numpy as np -import pandas as pd import pytest -import xdas -from xdas.coordinates import DenseCoordinate, InterpCoordinate, ScalarCoordinate - - -class TestDenseCoordinate: - valid = [ - [1, 2, 3], - np.array([1, 2, 3]), - [1.0, 2.0, 3.0], - np.array([1.0, 2.0, 3.0]), - ["a", "b", "c"], - np.array(["a", "b", "c"]), - np.array([1, 2, 3], dtype="datetime64[s]"), - ] - invalid = [ - 1, - np.array(1), - 1.0, - np.array(1.0), - "label", - np.array("label"), - np.datetime64(1, "s"), - {"key": "value"}, - ] - - def test_isvalid(self): - for data in self.valid: - assert DenseCoordinate.isvalid(data) - for data in self.invalid: - assert not DenseCoordinate.isvalid(data) - - def test_init(self): - coord = DenseCoordinate([1, 2, 3]) - assert np.array_equiv(coord.data, [1, 2, 3]) - assert coord.dim is None - coord = DenseCoordinate([1, 2, 3], "dim") - assert coord.dim == "dim" - for data in self.valid: - assert np.array_equiv(DenseCoordinate(data).data, data) - for data in self.invalid: - with pytest.raises(TypeError): - DenseCoordinate(data) - - def test_getitem(self): - assert np.array_equiv(DenseCoordinate([1, 2, 3])[...].values, [1, 2, 3]) - assert isinstance(DenseCoordinate([1, 2, 3])[...], DenseCoordinate) - assert np.array_equiv(DenseCoordinate([1, 2, 3])[:].values, [1, 2, 3]) - assert isinstance(DenseCoordinate([1, 2, 3])[:], DenseCoordinate) - assert np.array_equiv(DenseCoordinate([1, 2, 3])[1].values, 2) - 
assert isinstance(DenseCoordinate([1, 2, 3])[1], ScalarCoordinate) - assert np.array_equiv(DenseCoordinate([1, 2, 3])[1:].values, [2, 3]) - assert isinstance(DenseCoordinate([1, 2, 3])[1:], DenseCoordinate) - - def test_len(self): - for data in self.valid: - assert len(DenseCoordinate(data)) == 3 - - def test_repr(self): - for data in self.valid: - assert DenseCoordinate(data).__repr__() == np.array2string( - np.asarray(data), threshold=0, edgeitems=1 - ) - - def test_array(self): - for data in self.valid: - assert np.array_equiv(DenseCoordinate(data).__array__(), data) - - def test_dtype(self): - for data in self.valid: - assert DenseCoordinate(data).dtype == np.array(data).dtype - - def test_values(self): - for data in self.valid: - assert np.array_equiv(DenseCoordinate(data).values, data) - - def test_index(self): - for data in self.valid: - assert DenseCoordinate(data).index.equals(pd.Index(data)) - - def test_equals(self): - for data in self.valid: - coord = DenseCoordinate(data) - assert coord.equals(coord) - assert DenseCoordinate([1, 2, 3]).equals(DenseCoordinate([1, 2, 3])) - - def test_isinstance(self): - assert not DenseCoordinate([1, 2, 3]).isscalar() - assert DenseCoordinate([1, 2, 3]).isdense() - assert not DenseCoordinate([1, 2, 3]).isinterp() - - def test_get_indexer(self): - assert DenseCoordinate([1, 2, 3]).get_indexer(2) == 1 - assert np.array_equiv(DenseCoordinate([1, 2, 3]).get_indexer([2, 3]), [1, 2]) - assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="nearest") == 1 - assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="ffill") == 1 - assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="bfill") == 2 - - def test_get_slice_indexer(self): - assert np.array_equiv( - DenseCoordinate([1, 2, 3]).slice_indexer(start=2), slice(1, 3) - ) - - def test_to_index(self): - assert DenseCoordinate([1, 2, 3]).to_index(2) == 1 - assert np.array_equiv(DenseCoordinate([1, 2, 3]).to_index([2, 3]), [1, 2]) - assert np.array_equiv( - 
DenseCoordinate([1, 2, 3]).to_index(slice(2, None)), slice(1, 3) - ) - - def test_to_from_dict(self): - for data in self.valid: - coord = DenseCoordinate(data) - assert DenseCoordinate.from_dict(coord.to_dict()).equals(coord) - - def test_empty(self): - coord = DenseCoordinate() - assert coord.empty - - def test_append(self): - coord0 = DenseCoordinate() - coord1 = DenseCoordinate([1, 2, 3]) - coord2 = DenseCoordinate([4, 5, 6]) - - result = coord1.append(coord2) - expected = DenseCoordinate([1, 2, 3, 4, 5, 6]) - assert result.equals(expected) - - result = coord2.append(coord1) - expected = DenseCoordinate([4, 5, 6, 1, 2, 3]) - assert result.equals(expected) - - assert coord0.append(coord0).empty - assert coord0.append(coord1).equals(coord1) - assert coord1.append(coord0).equals(coord1) - - -class TestInterpCoordinate: - valid = [ - {"tie_indices": [], "tie_values": []}, - {"tie_indices": [0], "tie_values": [100.0]}, - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, - {"tie_indices": [0, 8], "tie_values": [100, 900]}, - { - "tie_indices": [0, 8], - "tie_values": [ - np.datetime64("2000-01-01T00:00:00"), - np.datetime64("2000-01-01T00:00:08"), - ], - }, - {"tie_indices": np.array([0, 8], dtype="int16"), "tie_values": [100.0, 900.0]}, - ] - invalid = [ - 1, - np.array(1), - 1.0, - np.array(1.0), - "label", - np.array("label"), - np.datetime64(1, "s"), - [1, 2, 3], - np.array([1, 2, 3]), - [1.0, 2.0, 3.0], - np.array([1.0, 2.0, 3.0]), - ["a", "b", "c"], - np.array(["a", "b", "c"]), - np.array([1, 2, 3], dtype="datetime64[s]"), - {"key": "value"}, - ] - error = [ - {"tie_indices": 0, "tie_values": [100.0]}, - {"tie_indices": [0], "tie_values": 100.0}, - {"tie_indices": [0, 7, 8], "tie_values": [100.0, 900.0]}, - {"tie_indices": [0.0, 8.0], "tie_values": [100.0, 900.0]}, - {"tie_indices": [1, 9], "tie_values": [100.0, 900.0]}, - {"tie_indices": [8, 0], "tie_values": [100.0, 900.0]}, - {"tie_indices": [8, 0], "tie_values": ["a", "b"]}, - ] - - def 
test_isvalid(self): - for data in self.valid: - assert InterpCoordinate.isvalid(data) - for data in self.invalid: - assert not InterpCoordinate.isvalid(data) - - def test_init(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.array_equiv(coord.data["tie_indices"], [0, 8]) - assert np.array_equiv(coord.data["tie_values"], [100.0, 900.0]) - assert coord.dim is None - coord = InterpCoordinate( - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, "dim" - ) - assert coord.dim == "dim" - for data in self.valid: - coord = InterpCoordinate(data) - assert np.array_equiv(coord.data["tie_indices"], data["tie_indices"]) - assert np.array_equiv(coord.data["tie_values"], data["tie_values"]) - for data in self.invalid: - with pytest.raises(TypeError): - InterpCoordinate(data) - for data in self.error: - with pytest.raises(ValueError): - InterpCoordinate(data) - - def test_len(self): - assert ( - len(InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})) - == 9 - ) - assert len(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) == 0 - - @pytest.mark.parametrize("valid_input", valid) - def test_repr(self, valid_input): - coord = InterpCoordinate(data=valid_input) - my_coord = repr(coord) - assert isinstance(my_coord, str) - - def test_equals(self): - coord1 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - coord2 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord1.equals(coord2) - - def test_getitem(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert isinstance(coord[0], ScalarCoordinate) - assert coord[0].values == 100.0 - assert coord[4].values == 500.0 - assert coord[8].values == 900.0 - assert coord[-1].values == 900.0 - assert coord[-2].values == 800.0 - assert np.allclose(coord[[1, 2, 3]].values, [200.0, 300.0, 400.0]) - with pytest.raises(IndexError): - coord[9] - coord[-9] - assert 
coord[0:2].equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) - ) - assert coord[:].equals(coord) - assert coord[6:3].equals(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) - assert coord[1:2].equals( - InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) - ) - assert coord[-3:-1].equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[700.0, 800.0])) - ) - - def test_setitem(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - with pytest.raises(TypeError): - coord[1] = 0 - coord[:] = 0 - - def test_asarray(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.allclose(np.asarray(coord), coord.values) - - def test_empty(self): - assert not InterpCoordinate( - {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]} - ).empty - assert InterpCoordinate(dict(tie_indices=[], tie_values=[])).empty - - def test_dtype(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.dtype == np.float64 - - def test_ndim(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.ndim == 1 - assert isinstance(coord.ndim, int) - - def test_shape(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.shape == (9,) - - def test_format_index(self): - # TODO - pass - - def test_format_index_slice(self): - # TODO - pass - - def test_get_value(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.get_value(0) == 100.0 - assert coord.get_value(4) == 500.0 - assert coord.get_value(8) == 900.0 - assert coord.get_value(-1) == 900.0 - assert coord.get_value(-9) == 100.0 - assert np.allclose(coord.get_value([1, 2, 3, -2]), [200.0, 300.0, 400.0, 800.0]) - with pytest.raises(IndexError): - coord.get_value(-10) - coord.get_value(9) - coord.get_value(0.5) - starttime = 
np.datetime64("2000-01-01T00:00:00") - endtime = np.datetime64("2000-01-01T00:00:08") - coord = InterpCoordinate( - dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) - ) - assert coord.get_value(0) == starttime - assert coord.get_value(4) == np.datetime64("2000-01-01T00:00:04") - assert coord.get_value(8) == endtime - assert coord.get_value(-1) == endtime - assert coord.get_value(-9) == starttime - - def test_get_index(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.get_indexer(100.0) == 0 - assert coord.get_indexer(900.0) == 8 - assert coord.get_indexer(0.0, "nearest") == 0 - assert coord.get_indexer(1000.0, "nearest") == 8 - assert coord.get_indexer(125.0, "nearest") == 0 - assert coord.get_indexer(175.0, "nearest") == 1 - assert coord.get_indexer(175.0, "ffill") == 0 - assert coord.get_indexer(200.0, "ffill") == 1 - assert coord.get_indexer(200.0, "bfill") == 1 - assert coord.get_indexer(125.0, "bfill") == 1 - assert np.all(np.equal(coord.get_indexer([100.0, 900.0]), [0, 8])) - with pytest.raises(KeyError): - assert coord.get_indexer(0.0) == 0 - assert coord.get_indexer(1000.0) == 8 - assert coord.get_indexer(150.0) == 0 - assert coord.get_indexer(1000.0, "bfill") == 8 - assert coord.get_indexer(0.0, "ffill") == 0 - - starttime = np.datetime64("2000-01-01T00:00:00") - endtime = np.datetime64("2000-01-01T00:00:08") - coord = InterpCoordinate( - dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) - ) - assert coord.get_indexer(starttime) == 0 - assert coord.get_indexer(endtime) == 8 - assert coord.get_indexer(str(starttime)) == 0 - assert coord.get_indexer(str(endtime)) == 8 - assert coord.get_indexer("2000-01-01T00:00:04.1", "nearest") == 4 - - def test_indices(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert np.all(np.equal(coord.indices, np.arange(9))) - - def test_values(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": 
[100.0, 900.0]}) - assert np.allclose(coord.values, np.arange(100.0, 1000.0, 100.0)) - - def test_get_index_slice(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.slice_indexer(100.0, 200.0) == slice(0, 2) - assert coord.slice_indexer(150.0, 250.0) == slice(1, 2) - assert coord.slice_indexer(300.0, 500.0) == slice(2, 5) - assert coord.slice_indexer(0.0, 500.0) == slice(0, 5) - assert coord.slice_indexer(125.0, 175.0) == slice(1, 1) - assert coord.slice_indexer(0.0, 50.0) == slice(0, 0) - assert coord.slice_indexer(1000.0, 1100.0) == slice(9, 9) - assert coord.slice_indexer(1000.0, 500.0) == slice(9, 5) - assert coord.slice_indexer(None, None) == slice(None, None) - - def test_slice_index(self): - coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) - assert coord.slice_index(slice(0, 2)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) - ) - assert coord.slice_index(slice(7, None)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) - ) - assert coord.slice_index(slice(None, None)).equals(coord) - assert coord.slice_index(slice(0, 0)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(4, 2)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(9, 9)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(3, 3)).equals( - InterpCoordinate(dict(tie_indices=[], tie_values=[])) - ) - assert coord.slice_index(slice(0, -1)).equals( - InterpCoordinate(dict(tie_indices=[0, 7], tie_values=[100.0, 800.0])) - ) - assert coord.slice_index(slice(0, -2)).equals( - InterpCoordinate(dict(tie_indices=[0, 6], tie_values=[100.0, 700.0])) - ) - assert coord.slice_index(slice(-2, None)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) - ) - assert coord.slice_index(slice(1, 
2)).equals( - InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) - ) - assert coord.slice_index(slice(1, 3, 2)).equals( - InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) - ) - assert coord.slice_index(slice(None, None, 2)).equals( - InterpCoordinate(dict(tie_indices=[0, 4], tie_values=[100.0, 900.0])) - ) - assert coord.slice_index(slice(None, None, 3)).equals( - InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 700.0])) - ) - assert coord.slice_index(slice(None, None, 4)).equals( - InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 900.0])) - ) - assert coord.slice_index(slice(None, None, 5)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 600.0])) - ) - assert coord.slice_index(slice(2, 7, 3)).equals( - InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[300.0, 600.0])) - ) - - def test_to_index(self): - # TODO - pass - - def test_simplify(self): - xp = np.sort(np.random.choice(10000, 1000, replace=False)) - xp[0] = 0 - xp[-1] = 10000 - yp = xp + (np.random.rand(1000) - 0.5) - coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) - assert len(coord.simplify(1.0).tie_indices) == 2 - - def test_singleton(self): - coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) - assert coord[0].values == 1.0 - - def test_to_from_dict(self): - for data in self.valid: - coord = InterpCoordinate(data) - assert InterpCoordinate.from_dict(coord.to_dict()).equals(coord) - - def test_append(self): - coord0 = InterpCoordinate() - coord1 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [0, 20]}) - coord2 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [30, 50]}) - - result = coord1.append(coord2).simplify() - expected = InterpCoordinate({"tie_indices": [0, 5], "tie_values": [0, 50]}) - assert result.equals(expected) - - result = coord2.append(coord1).simplify() - expected = InterpCoordinate( - {"tie_indices": [0, 2, 3, 5], "tie_values": [30, 50, 0, 20]} - ) - assert 
result.equals(expected) - - assert coord0.append(coord0).empty - assert coord0.append(coord1).equals(coord1) - assert coord1.append(coord0).equals(coord1) +import xdas as xd class TestCoordinate: def test_new(self): - assert xdas.Coordinate(1).isscalar() - assert xdas.Coordinate([1]).isdense() - assert xdas.Coordinate({"tie_values": [], "tie_indices": []}).isinterp() - coord = xdas.Coordinate(xdas.Coordinate([1]), "dim") + assert xd.Coordinate(1).isscalar() + assert xd.Coordinate([1]).isdense() + assert xd.Coordinate({"tie_values": [], "tie_indices": []}).isinterp() + coord = xd.Coordinate(xd.Coordinate([1]), "dim") assert coord.isdense() assert coord.dim == "dim" def test_to_dataarray(self): - coord = xdas.Coordinate([1, 2, 3], "dim") + coord = xd.Coordinate([1, 2, 3], "dim") result = coord.to_dataarray() - expected = xdas.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") + expected = xd.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") assert result.equals(expected) def test_empty(self): with pytest.raises(TypeError, match="cannot infer coordinate type"): - xdas.Coordinate() + xd.Coordinate() def test_isdim(self): - coord = xdas.Coordinate([1, 2, 3]) + coord = xd.Coordinate([1, 2, 3]) assert coord.isdim() is None - coord = xdas.Coordinate([1, 2, 3], "dim") + coord = xd.Coordinate([1, 2, 3], "dim") assert coord.isdim() is None - coords = xdas.Coordinates({"dim": coord}) + coords = xd.Coordinates({"dim": coord}) assert coords["dim"].isdim() - coords = xdas.Coordinates({"other_dim": coord}) + coords = xd.Coordinates({"other_dim": coord}) assert not coords["other_dim"].isdim() def test_name(self): - coord = xdas.Coordinate([1, 2, 3]) + coord = xd.Coordinate([1, 2, 3]) assert coord.name is None - coord = xdas.Coordinate([1, 2, 3], "dim") + coord = xd.Coordinate([1, 2, 3], "dim") assert coord.name == "dim" - coords = xdas.Coordinates({"dim": coord}) + coords = xd.Coordinates({"dim": coord}) assert coords["dim"].name == "dim" - coords = 
xdas.Coordinates({"other_dim": coord}) + coords = xd.Coordinates({"other_dim": coord}) assert coords["other_dim"].name == "other_dim" def test_to_dataarray(self): - coord = xdas.Coordinate([1, 2, 3]) + coord = xd.Coordinate([1, 2, 3]) with pytest.raises(ValueError, match="unnamed coordinate"): coord.to_dataarray() - coord = xdas.Coordinate([1, 2, 3], "dim") + coord = xd.Coordinate([1, 2, 3], "dim") result = coord.to_dataarray() - expected = xdas.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") + expected = xd.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") assert result.equals(expected) - coords = xdas.Coordinates({"dim": coord}) + coords = xd.Coordinates({"dim": coord}) result = coords["dim"].to_dataarray() assert result.equals(expected) - coords = xdas.Coordinates({"other_dim": coord}) + coords = xd.Coordinates({"other_dim": coord}) result = coords["other_dim"].to_dataarray() - expected = xdas.DataArray( + expected = xd.DataArray( [1, 2, 3], coords={"other_dim": coord}, dims=["dim"], name="other_dim" ) assert result.equals(expected) coords["dim"] = [4, 5, 6] result = coords["dim"].to_dataarray() - expected = xdas.DataArray( + expected = xd.DataArray( [4, 5, 6], coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, dims=["dim"], @@ -518,7 +70,7 @@ def test_to_dataarray(self): ) assert result.equals(expected) result = coords["other_dim"].to_dataarray() - expected = xdas.DataArray( + expected = xd.DataArray( [1, 2, 3], coords={"dim": [4, 5, 6], "other_dim": ("dim", [1, 2, 3])}, dims=["dim"], @@ -529,7 +81,7 @@ def test_to_dataarray(self): class TestCoordinates: def test_init(self): - coords = xdas.Coordinates( + coords = xd.Coordinates( {"dim": ("dim", {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})} ) coord = coords["dim"] @@ -537,12 +89,12 @@ def test_init(self): assert np.allclose(coord.tie_indices, [0, 8]) assert np.allclose(coord.tie_values, [100.0, 900.0]) assert coords.isdim("dim") - coords = xdas.Coordinates({"dim": [1.0, 2.0, 3.0]}) + 
coords = xd.Coordinates({"dim": [1.0, 2.0, 3.0]}) coord = coords["dim"] assert coord.isdense() assert np.allclose(coord.values, [1.0, 2.0, 3.0]) assert coords.isdim("dim") - coords = xdas.Coordinates( + coords = xd.Coordinates( { "dim_0": ( "dim_0", @@ -556,17 +108,17 @@ def test_init(self): ) assert coords.isdim("dim_0") assert not coords.isdim("dim_1") - coords = xdas.Coordinates() + coords = xd.Coordinates() assert coords == dict() assert coords.dims == tuple() def test_first_last(self): - coords = xdas.Coordinates({"dim_0": [1.0, 2.0, 3.0], "dim_1": [1.0, 2.0, 3.0]}) + coords = xd.Coordinates({"dim_0": [1.0, 2.0, 3.0], "dim_1": [1.0, 2.0, 3.0]}) assert coords["first"].dim == "dim_0" assert coords["last"].dim == "dim_1" def test_setitem(self): - coords = xdas.Coordinates() + coords = xd.Coordinates() coords["dim_0"] = [1, 2, 4] assert coords.dims == ("dim_0",) coords["dim_1"] = {"tie_indices": [0, 10], "tie_values": [0.0, 100.0]} @@ -591,5 +143,5 @@ def test_to_from_dict(self): "channel": ("distance", ["DAS01", "DAS02", "DAS03"]), "interrogator": (None, "SRN"), } - coords = xdas.Coordinates(coords) - assert xdas.Coordinates.from_dict(coords.to_dict()).equals(coords) + coords = xd.Coordinates(coords) + assert xd.Coordinates.from_dict(coords.to_dict()).equals(coords) diff --git a/tests/coordinates/test_dense.py b/tests/coordinates/test_dense.py new file mode 100644 index 00000000..0524bac5 --- /dev/null +++ b/tests/coordinates/test_dense.py @@ -0,0 +1,137 @@ +import numpy as np +import pandas as pd +import pytest + +from xdas.coordinates import DenseCoordinate, ScalarCoordinate + + +class TestDenseCoordinate: + valid = [ + [1, 2, 3], + np.array([1, 2, 3]), + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array([1, 2, 3], dtype="datetime64[s]"), + ] + invalid = [ + 1, + np.array(1), + 1.0, + np.array(1.0), + "label", + np.array("label"), + np.datetime64(1, "s"), + {"key": "value"}, + ] + + def test_isvalid(self): 
+ for data in self.valid: + assert DenseCoordinate.isvalid(data) + for data in self.invalid: + assert not DenseCoordinate.isvalid(data) + + def test_init(self): + coord = DenseCoordinate([1, 2, 3]) + assert np.array_equiv(coord.data, [1, 2, 3]) + assert coord.dim is None + coord = DenseCoordinate([1, 2, 3], "dim") + assert coord.dim == "dim" + for data in self.valid: + assert np.array_equiv(DenseCoordinate(data).data, data) + for data in self.invalid: + with pytest.raises(TypeError): + DenseCoordinate(data) + + def test_getitem(self): + assert np.array_equiv(DenseCoordinate([1, 2, 3])[...].values, [1, 2, 3]) + assert isinstance(DenseCoordinate([1, 2, 3])[...], DenseCoordinate) + assert np.array_equiv(DenseCoordinate([1, 2, 3])[:].values, [1, 2, 3]) + assert isinstance(DenseCoordinate([1, 2, 3])[:], DenseCoordinate) + assert np.array_equiv(DenseCoordinate([1, 2, 3])[1].values, 2) + assert isinstance(DenseCoordinate([1, 2, 3])[1], ScalarCoordinate) + assert np.array_equiv(DenseCoordinate([1, 2, 3])[1:].values, [2, 3]) + assert isinstance(DenseCoordinate([1, 2, 3])[1:], DenseCoordinate) + + def test_len(self): + for data in self.valid: + assert len(DenseCoordinate(data)) == 3 + + def test_repr(self): + for data in self.valid: + assert DenseCoordinate(data).__repr__() == np.array2string( + np.asarray(data), threshold=0, edgeitems=1 + ) + + def test_array(self): + for data in self.valid: + assert np.array_equiv(DenseCoordinate(data).__array__(), data) + + def test_dtype(self): + for data in self.valid: + assert DenseCoordinate(data).dtype == np.array(data).dtype + + def test_values(self): + for data in self.valid: + assert np.array_equiv(DenseCoordinate(data).values, data) + + def test_index(self): + for data in self.valid: + assert DenseCoordinate(data).index.equals(pd.Index(data)) + + def test_equals(self): + for data in self.valid: + coord = DenseCoordinate(data) + assert coord.equals(coord) + assert DenseCoordinate([1, 2, 3]).equals(DenseCoordinate([1, 2, 3])) + + 
def test_isinstance(self): + assert not DenseCoordinate([1, 2, 3]).isscalar() + assert DenseCoordinate([1, 2, 3]).isdense() + assert not DenseCoordinate([1, 2, 3]).isinterp() + + def test_get_indexer(self): + assert DenseCoordinate([1, 2, 3]).get_indexer(2) == 1 + assert np.array_equiv(DenseCoordinate([1, 2, 3]).get_indexer([2, 3]), [1, 2]) + assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="nearest") == 1 + assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="ffill") == 1 + assert DenseCoordinate([1, 2, 3]).get_indexer(2.1, method="bfill") == 2 + + def test_get_slice_indexer(self): + assert np.array_equiv( + DenseCoordinate([1, 2, 3]).slice_indexer(start=2), slice(1, 3) + ) + + def test_to_index(self): + assert DenseCoordinate([1, 2, 3]).to_index(2) == 1 + assert np.array_equiv(DenseCoordinate([1, 2, 3]).to_index([2, 3]), [1, 2]) + assert np.array_equiv( + DenseCoordinate([1, 2, 3]).to_index(slice(2, None)), slice(1, 3) + ) + + def test_to_from_dict(self): + for data in self.valid: + coord = DenseCoordinate(data) + assert DenseCoordinate.from_dict(coord.to_dict()).equals(coord) + + def test_empty(self): + coord = DenseCoordinate() + assert coord.empty + + def test_append(self): + coord0 = DenseCoordinate() + coord1 = DenseCoordinate([1, 2, 3]) + coord2 = DenseCoordinate([4, 5, 6]) + + result = coord1.append(coord2) + expected = DenseCoordinate([1, 2, 3, 4, 5, 6]) + assert result.equals(expected) + + result = coord2.append(coord1) + expected = DenseCoordinate([4, 5, 6, 1, 2, 3]) + assert result.equals(expected) + + assert coord0.append(coord0).empty + assert coord0.append(coord1).equals(coord1) + assert coord1.append(coord0).equals(coord1) diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py new file mode 100644 index 00000000..95b1d77c --- /dev/null +++ b/tests/coordinates/test_interp.py @@ -0,0 +1,318 @@ +import numpy as np +import pytest + +from xdas.coordinates import InterpCoordinate, ScalarCoordinate + + +class 
TestInterpCoordinate: + valid = [ + {"tie_indices": [], "tie_values": []}, + {"tie_indices": [0], "tie_values": [100.0]}, + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, + {"tie_indices": [0, 8], "tie_values": [100, 900]}, + { + "tie_indices": [0, 8], + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-01T00:00:08"), + ], + }, + {"tie_indices": np.array([0, 8], dtype="int16"), "tie_values": [100.0, 900.0]}, + ] + invalid = [ + 1, + np.array(1), + 1.0, + np.array(1.0), + "label", + np.array("label"), + np.datetime64(1, "s"), + [1, 2, 3], + np.array([1, 2, 3]), + [1.0, 2.0, 3.0], + np.array([1.0, 2.0, 3.0]), + ["a", "b", "c"], + np.array(["a", "b", "c"]), + np.array([1, 2, 3], dtype="datetime64[s]"), + {"key": "value"}, + ] + error = [ + {"tie_indices": 0, "tie_values": [100.0]}, + {"tie_indices": [0], "tie_values": 100.0}, + {"tie_indices": [0, 7, 8], "tie_values": [100.0, 900.0]}, + {"tie_indices": [0.0, 8.0], "tie_values": [100.0, 900.0]}, + {"tie_indices": [1, 9], "tie_values": [100.0, 900.0]}, + {"tie_indices": [8, 0], "tie_values": [100.0, 900.0]}, + {"tie_indices": [8, 0], "tie_values": ["a", "b"]}, + ] + + def test_isvalid(self): + for data in self.valid: + assert InterpCoordinate.isvalid(data) + for data in self.invalid: + assert not InterpCoordinate.isvalid(data) + + def test_init(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.array_equiv(coord.data["tie_indices"], [0, 8]) + assert np.array_equiv(coord.data["tie_values"], [100.0, 900.0]) + assert coord.dim is None + coord = InterpCoordinate( + {"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}, "dim" + ) + assert coord.dim == "dim" + for data in self.valid: + coord = InterpCoordinate(data) + assert np.array_equiv(coord.data["tie_indices"], data["tie_indices"]) + assert np.array_equiv(coord.data["tie_values"], data["tie_values"]) + for data in self.invalid: + with pytest.raises(TypeError): + 
InterpCoordinate(data) + for data in self.error: + with pytest.raises(ValueError): + InterpCoordinate(data) + + def test_len(self): + assert ( + len(InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]})) + == 9 + ) + assert len(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) == 0 + + @pytest.mark.parametrize("valid_input", valid) + def test_repr(self, valid_input): + coord = InterpCoordinate(data=valid_input) + my_coord = repr(coord) + assert isinstance(my_coord, str) + + def test_equals(self): + coord1 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + coord2 = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord1.equals(coord2) + + def test_getitem(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert isinstance(coord[0], ScalarCoordinate) + assert coord[0].values == 100.0 + assert coord[4].values == 500.0 + assert coord[8].values == 900.0 + assert coord[-1].values == 900.0 + assert coord[-2].values == 800.0 + assert np.allclose(coord[[1, 2, 3]].values, [200.0, 300.0, 400.0]) + with pytest.raises(IndexError): + coord[9] + coord[-9] + assert coord[0:2].equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) + ) + assert coord[:].equals(coord) + assert coord[6:3].equals(InterpCoordinate(dict(tie_indices=[], tie_values=[]))) + assert coord[1:2].equals( + InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) + ) + assert coord[-3:-1].equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[700.0, 800.0])) + ) + + def test_setitem(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + with pytest.raises(TypeError): + coord[1] = 0 + coord[:] = 0 + + def test_asarray(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.allclose(np.asarray(coord), coord.values) + + def test_empty(self): + assert not InterpCoordinate( + 
{"tie_indices": [0, 8], "tie_values": [100.0, 900.0]} + ).empty + assert InterpCoordinate(dict(tie_indices=[], tie_values=[])).empty + + def test_dtype(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.dtype == np.float64 + + def test_ndim(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.ndim == 1 + assert isinstance(coord.ndim, int) + + def test_shape(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.shape == (9,) + + def test_format_index(self): + # TODO + pass + + def test_format_index_slice(self): + # TODO + pass + + def test_get_value(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.get_value(0) == 100.0 + assert coord.get_value(4) == 500.0 + assert coord.get_value(8) == 900.0 + assert coord.get_value(-1) == 900.0 + assert coord.get_value(-9) == 100.0 + assert np.allclose(coord.get_value([1, 2, 3, -2]), [200.0, 300.0, 400.0, 800.0]) + with pytest.raises(IndexError): + coord.get_value(-10) + coord.get_value(9) + coord.get_value(0.5) + starttime = np.datetime64("2000-01-01T00:00:00") + endtime = np.datetime64("2000-01-01T00:00:08") + coord = InterpCoordinate( + dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) + ) + assert coord.get_value(0) == starttime + assert coord.get_value(4) == np.datetime64("2000-01-01T00:00:04") + assert coord.get_value(8) == endtime + assert coord.get_value(-1) == endtime + assert coord.get_value(-9) == starttime + + def test_get_index(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.get_indexer(100.0) == 0 + assert coord.get_indexer(900.0) == 8 + assert coord.get_indexer(0.0, "nearest") == 0 + assert coord.get_indexer(1000.0, "nearest") == 8 + assert coord.get_indexer(125.0, "nearest") == 0 + assert coord.get_indexer(175.0, "nearest") == 1 + assert 
coord.get_indexer(175.0, "ffill") == 0 + assert coord.get_indexer(200.0, "ffill") == 1 + assert coord.get_indexer(200.0, "bfill") == 1 + assert coord.get_indexer(125.0, "bfill") == 1 + assert np.all(np.equal(coord.get_indexer([100.0, 900.0]), [0, 8])) + with pytest.raises(KeyError): + assert coord.get_indexer(0.0) == 0 + assert coord.get_indexer(1000.0) == 8 + assert coord.get_indexer(150.0) == 0 + assert coord.get_indexer(1000.0, "bfill") == 8 + assert coord.get_indexer(0.0, "ffill") == 0 + + starttime = np.datetime64("2000-01-01T00:00:00") + endtime = np.datetime64("2000-01-01T00:00:08") + coord = InterpCoordinate( + dict(tie_indices=[0, 8], tie_values=[starttime, endtime]) + ) + assert coord.get_indexer(starttime) == 0 + assert coord.get_indexer(endtime) == 8 + assert coord.get_indexer(str(starttime)) == 0 + assert coord.get_indexer(str(endtime)) == 8 + assert coord.get_indexer("2000-01-01T00:00:04.1", "nearest") == 4 + + def test_indices(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.all(np.equal(coord.indices, np.arange(9))) + + def test_values(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert np.allclose(coord.values, np.arange(100.0, 1000.0, 100.0)) + + def test_get_index_slice(self): + coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) + assert coord.slice_indexer(100.0, 200.0) == slice(0, 2) + assert coord.slice_indexer(150.0, 250.0) == slice(1, 2) + assert coord.slice_indexer(300.0, 500.0) == slice(2, 5) + assert coord.slice_indexer(0.0, 500.0) == slice(0, 5) + assert coord.slice_indexer(125.0, 175.0) == slice(1, 1) + assert coord.slice_indexer(0.0, 50.0) == slice(0, 0) + assert coord.slice_indexer(1000.0, 1100.0) == slice(9, 9) + assert coord.slice_indexer(1000.0, 500.0) == slice(9, 5) + assert coord.slice_indexer(None, None) == slice(None, None) + + def test_slice_index(self): + coord = InterpCoordinate({"tie_indices": [0, 
8], "tie_values": [100.0, 900.0]}) + assert coord.slice_index(slice(0, 2)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 200.0])) + ) + assert coord.slice_index(slice(7, None)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) + ) + assert coord.slice_index(slice(None, None)).equals(coord) + assert coord.slice_index(slice(0, 0)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(4, 2)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(9, 9)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(3, 3)).equals( + InterpCoordinate(dict(tie_indices=[], tie_values=[])) + ) + assert coord.slice_index(slice(0, -1)).equals( + InterpCoordinate(dict(tie_indices=[0, 7], tie_values=[100.0, 800.0])) + ) + assert coord.slice_index(slice(0, -2)).equals( + InterpCoordinate(dict(tie_indices=[0, 6], tie_values=[100.0, 700.0])) + ) + assert coord.slice_index(slice(-2, None)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[800.0, 900.0])) + ) + assert coord.slice_index(slice(1, 2)).equals( + InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) + ) + assert coord.slice_index(slice(1, 3, 2)).equals( + InterpCoordinate(dict(tie_indices=[0], tie_values=[200.0])) + ) + assert coord.slice_index(slice(None, None, 2)).equals( + InterpCoordinate(dict(tie_indices=[0, 4], tie_values=[100.0, 900.0])) + ) + assert coord.slice_index(slice(None, None, 3)).equals( + InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 700.0])) + ) + assert coord.slice_index(slice(None, None, 4)).equals( + InterpCoordinate(dict(tie_indices=[0, 2], tie_values=[100.0, 900.0])) + ) + assert coord.slice_index(slice(None, None, 5)).equals( + InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[100.0, 600.0])) + ) + assert coord.slice_index(slice(2, 7, 3)).equals( + 
InterpCoordinate(dict(tie_indices=[0, 1], tie_values=[300.0, 600.0])) + ) + + def test_to_index(self): + # TODO + pass + + def test_simplify(self): + xp = np.sort(np.random.choice(10000, 1000, replace=False)) + xp[0] = 0 + xp[-1] = 10000 + yp = xp + (np.random.rand(1000) - 0.5) + coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) + assert len(coord.simplify(1.0).tie_indices) == 2 + + def test_singleton(self): + coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) + assert coord[0].values == 1.0 + + def test_to_from_dict(self): + for data in self.valid: + coord = InterpCoordinate(data) + assert InterpCoordinate.from_dict(coord.to_dict()).equals(coord) + + def test_append(self): + coord0 = InterpCoordinate() + coord1 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [0, 20]}) + coord2 = InterpCoordinate({"tie_indices": [0, 2], "tie_values": [30, 50]}) + + result = coord1.append(coord2).simplify() + expected = InterpCoordinate({"tie_indices": [0, 5], "tie_values": [0, 50]}) + assert result.equals(expected) + + result = coord2.append(coord1).simplify() + expected = InterpCoordinate( + {"tie_indices": [0, 2, 3, 5], "tie_values": [30, 50, 0, 20]} + ) + assert result.equals(expected) + + assert coord0.append(coord0).empty + assert coord0.append(coord1).equals(coord1) + assert coord1.append(coord0).equals(coord1) From b5e0132c1319a0622d08ca20ed883606d861065e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 16:19:33 +0100 Subject: [PATCH 043/104] Refactoring: merge tests with same name. 
--- tests/coordinates/test_coordinates.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/coordinates/test_coordinates.py b/tests/coordinates/test_coordinates.py index d0825bd4..9f87e336 100644 --- a/tests/coordinates/test_coordinates.py +++ b/tests/coordinates/test_coordinates.py @@ -13,12 +13,6 @@ def test_new(self): assert coord.isdense() assert coord.dim == "dim" - def test_to_dataarray(self): - coord = xd.Coordinate([1, 2, 3], "dim") - result = coord.to_dataarray() - expected = xd.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") - assert result.equals(expected) - def test_empty(self): with pytest.raises(TypeError, match="cannot infer coordinate type"): xd.Coordinate() @@ -44,6 +38,10 @@ def test_name(self): assert coords["other_dim"].name == "other_dim" def test_to_dataarray(self): + coord = xd.Coordinate([1, 2, 3], "dim") + result = coord.to_dataarray() + expected = xd.DataArray([1, 2, 3], {"dim": [1, 2, 3]}, name="dim") + assert result.equals(expected) coord = xd.Coordinate([1, 2, 3]) with pytest.raises(ValueError, match="unnamed coordinate"): coord.to_dataarray() From 09d837d5c55856666b73436842aa72c3cf6f10ab Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 18 Dec 2025 16:30:27 +0100 Subject: [PATCH 044/104] add Coordinate.get_div_points for more generic xdas.split. 
--- xdas/coordinates/core.py | 5 +++++ xdas/coordinates/interp.py | 6 ++++++ xdas/core/routines.py | 12 +----------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index d9e6813e..f8d0fa50 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -420,6 +420,11 @@ def issampled(self): def append(self, other): raise NotImplementedError(f"append is not implemented for {self.__class__}") + def get_div_points(self): + raise NotImplementedError( + f"get_div_points is not implemented for {self.__class__}" + ) + def to_dataarray(self): from ..core.dataarray import DataArray # TODO: avoid defered import? diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 6a3f7f60..d12a859d 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -292,6 +292,12 @@ def simplify(self, tolerance=None): dict(tie_indices=tie_indices, tie_values=tie_values), self.dim ) + def get_div_points(self): + (points,) = np.nonzero(np.diff(self.tie_indices, prepend=[0]) == 1) + div_points = [self.tie_indices[point] for point in points] + div_points = [0] + div_points + [len(self)] + return div_points + def get_discontinuities(self): """ Returns a DataFrame containing information about the discontinuities. 
diff --git a/xdas/core/routines.py b/xdas/core/routines.py index f34c4b8c..de810c8b 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -12,7 +12,6 @@ from tqdm import tqdm from ..coordinates.core import Coordinates, get_sampling_interval -from ..coordinates.interp import InterpCoordinate from ..virtual import VirtualSource, VirtualStack from .dataarray import DataArray from .datacollection import DataCollection, DataMapping, DataSequence @@ -779,16 +778,7 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None if isinstance(indices_or_sections, str) and ( indices_or_sections == "discontinuities" ): - if isinstance(da[dim], InterpCoordinate): - coord = da[dim].simplify(tolerance) - (points,) = np.nonzero(np.diff(coord.tie_indices, prepend=[0]) == 1) - div_points = [coord.tie_indices[point] for point in points] - div_points = [0] + div_points + [da.sizes[dim]] - else: - raise TypeError( - "discontinuities can only be found on dimension that have as type " - "`InterpCoordinate`." - ) + div_points = da[dim].simplify(tolerance).get_div_points() elif isinstance(indices_or_sections, int): nsamples = da.sizes[dim] nchunk = indices_or_sections From fa7d5bbece114140ea5485e3f87ca0c69ef38d6c Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 19 Dec 2025 12:26:30 +0100 Subject: [PATCH 045/104] WIP: make split on discontinuities work on any coord type. 
--- xdas/coordinates/core.py | 2 +- xdas/coordinates/dense.py | 11 +++++++++++ xdas/coordinates/interp.py | 7 +++++-- xdas/coordinates/sampled.py | 10 ++++++++++ xdas/core/routines.py | 2 +- 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index f8d0fa50..cd999da2 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -420,7 +420,7 @@ def issampled(self): def append(self, other): raise NotImplementedError(f"append is not implemented for {self.__class__}") - def get_div_points(self): + def get_div_points(self, tolerance=None): raise NotImplementedError( f"get_div_points is not implemented for {self.__class__}" ) diff --git a/xdas/coordinates/dense.py b/xdas/coordinates/dense.py index 9ebef73f..ff4d355c 100644 --- a/xdas/coordinates/dense.py +++ b/xdas/coordinates/dense.py @@ -71,6 +71,17 @@ def append(self, other): raise ValueError("cannot append coordinate with different dtype") return self.__class__(np.concatenate([self.data, other.data]), self.dim) + def get_div_points(self, tolerance=None): + deltas = np.diff(self.data) + if tolerance is not None: + div_points = np.nonzero(np.abs(deltas) >= tolerance)[0] + 1 + else: + raise NotImplementedError( + "get_div_points without tolerance is not implemented for DenseCoordinate" + ) + div_points = np.concatenate(([0], div_points, [len(self)])) + return div_points + def to_dict(self): if np.issubdtype(self.dtype, np.datetime64): data = self.data.astype(str).tolist() diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index d12a859d..40e85880 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -292,11 +292,14 @@ def simplify(self, tolerance=None): dict(tie_indices=tie_indices, tie_values=tie_values), self.dim ) - def get_div_points(self): + def get_div_points(self, tolerance=None): (points,) = np.nonzero(np.diff(self.tie_indices, prepend=[0]) == 1) + deltas = self.tie_values[points] - 
self.tie_values[points - 1] + if tolerance is not None: + points = points[np.abs(deltas) >= tolerance] div_points = [self.tie_indices[point] for point in points] div_points = [0] + div_points + [len(self)] - return div_points + return np.array(div_points) def get_discontinuities(self): """ diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 28fead89..40ac11eb 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -346,6 +346,16 @@ def simplify(self, tolerance=None): self.dim, ) + def get_div_points(self, tolerance=None): + div_points = self.tie_indices[1:] + if tolerance is not None: + deltas = self.tie_values[1:] - ( + self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] + ) + div_points = div_points[np.abs(deltas) >= tolerance] + div_points = np.concatenate(([0], div_points, [len(self)])) + return div_points + def get_discontinuities(self): if self.empty: return pd.DataFrame( diff --git a/xdas/core/routines.py b/xdas/core/routines.py index de810c8b..08933926 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -778,7 +778,7 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None if isinstance(indices_or_sections, str) and ( indices_or_sections == "discontinuities" ): - div_points = da[dim].simplify(tolerance).get_div_points() + div_points = da[dim].get_div_points(tolerance) elif isinstance(indices_or_sections, int): nsamples = da.sizes[dim] nchunk = indices_or_sections From ec256a4327b0acb235e00109a15492244cc67eee Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 19 Dec 2025 15:03:30 +0100 Subject: [PATCH 046/104] Add `Coordinate.get_split_indices` and move `get_availabilities` and `get_discontinuities` as a generic method in `Coordinate`. 
--- xdas/coordinates/core.py | 113 +++++++++++++++++++++++++++++++++++- xdas/coordinates/interp.py | 113 +++--------------------------------- xdas/coordinates/sampled.py | 70 ++-------------------- xdas/core/routines.py | 13 ++--- 4 files changed, 129 insertions(+), 180 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index cd999da2..a159a3ad 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -1,7 +1,9 @@ from copy import copy, deepcopy from functools import wraps +from itertools import pairwise import numpy as np +import pandas as pd def wraps_first_last(func): @@ -420,11 +422,118 @@ def issampled(self): def append(self, other): raise NotImplementedError(f"append is not implemented for {self.__class__}") - def get_div_points(self, tolerance=None): + def get_split_indices(self, tolerance=None): raise NotImplementedError( - f"get_div_points is not implemented for {self.__class__}" + f"get_split_indices is not implemented for {self.__class__}" ) + def get_discontinuities(self, tolerance=None): + """ + Returns a DataFrame containing information about the discontinuities. + + Returns + ------- + pandas.DataFrame + A DataFrame with the following columns: + + - start_index : int + The index where the discontinuity starts. + - end_index : int + The index where the discontinuity ends. + - start_value : float + The value at the start of the discontinuity. + - end_value : float + The value at the end of the discontinuity. + - delta : float + The difference between the end_value and start_value. + - type : str + The type of the discontinuity, either "gap" or "overlap". 
+ + """ + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + indices = self.get_split_indices(tolerance) + records = [] + for index in indices: + start_index = index + end_index = index + 1 + start_value = self.get_value(index) + end_value = self.get_value(index + 1) + delta = end_value - start_value + if tolerance is not None and np.abs(delta) < tolerance: + continue + record = { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": ("gap" if end_value > start_value else "overlap"), + } + records.append(record) + return pd.DataFrame.from_records(records) + + def get_availabilities(self): + """ + Returns a DataFrame containing information about the data availability. + + Returns + ------- + pandas.DataFrame + A DataFrame with the following columns: + + - start_index : int + The index where the discontinuity starts. + - end_index : int + The index where the discontinuity ends. + - start_value : float + The value at the start of the discontinuity. + - end_value : float + The value at the end of the discontinuity. + - delta : float + The difference between the end_value and start_value. + - type : str + The type of the discontinuity, always "data". 
+ + """ + if self.empty: + return pd.DataFrame( + columns=[ + "start_index", + "end_index", + "start_value", + "end_value", + "delta", + "type", + ] + ) + indices = np.concatenate([[0], self.get_split_indices(), [len(self)]]) + records = [] + for start_index, stop_index in pairwise(indices): + end_index = stop_index - 1 + start_value = self.get_value(start_index) + end_value = self.get_value(end_index) + records.append( + { + "start_index": start_index, + "end_index": end_index, + "start_value": start_value, + "end_value": end_value, + "delta": end_value - start_value, + "type": "data", + } + ) + return pd.DataFrame.from_records(records) + def to_dataarray(self): from ..core.dataarray import DataArray # TODO: avoid defered import? diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 40e85880..d7958499 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -292,112 +292,15 @@ def simplify(self, tolerance=None): dict(tie_indices=tie_indices, tie_values=tie_values), self.dim ) - def get_div_points(self, tolerance=None): - (points,) = np.nonzero(np.diff(self.tie_indices, prepend=[0]) == 1) - deltas = self.tie_values[points] - self.tie_values[points - 1] - if tolerance is not None: - points = points[np.abs(deltas) >= tolerance] - div_points = [self.tie_indices[point] for point in points] - div_points = [0] + div_points + [len(self)] - return np.array(div_points) - - def get_discontinuities(self): - """ - Returns a DataFrame containing information about the discontinuities. - - Returns - ------- - pandas.DataFrame - A DataFrame with the following columns: - - - start_index : int - The index where the discontinuity starts. - - end_index : int - The index where the discontinuity ends. - - start_value : float - The value at the start of the discontinuity. - - end_value : float - The value at the end of the discontinuity. - - delta : float - The difference between the end_value and start_value. 
- - type : str - The type of the discontinuity, either "gap" or "overlap". - - """ + def get_split_indices(self, tolerance=None): (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) - records = [] - for index in indices: - start_index = self.tie_indices[index] - end_index = self.tie_indices[index + 1] - start_value = self.tie_values[index] - end_value = self.tie_values[index + 1] - record = { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": ("gap" if end_value > start_value else "overlap"), - } - records.append(record) - return pd.DataFrame.from_records(records) - - def get_availabilities(self): - """ - Returns a DataFrame containing information about the data availability. - - Returns - ------- - pandas.DataFrame - A DataFrame with the following columns: - - - start_index : int - The index where the discontinuity starts. - - end_index : int - The index where the discontinuity ends. - - start_value : float - The value at the start of the discontinuity. - - end_value : float - The value at the end of the discontinuity. - - delta : float - The difference between the end_value and start_value. - - type : str - The type of the discontinuity, always "data". 
- - """ - if self.empty: - return pd.DataFrame( - columns=[ - "start_index", - "end_index", - "start_value", - "end_value", - "delta", - "type", - ] - ) - (cut_pos,) = np.nonzero(np.diff(self.tie_indices) == 1) - # start each segment after the previous cut (or at 0) - starts = np.concatenate(([0], cut_pos + 1)) - # end each segment at the cut position (or at n-1 for the last) - ends = np.concatenate((cut_pos, [len(self.tie_indices) - 1])) - records = [] - for s, e in zip(starts, ends): - start_index = self.tie_indices[s] - end_index = self.tie_indices[e] - start_value = self.tie_values[s] - end_value = self.tie_values[e] - records.append( - { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": "data", - } - ) - return pd.DataFrame.from_records(records) + indices += 1 + if tolerance is not None: + deltas = self.tie_values[indices + 1] - self.tie_values[indices] + indices = indices[np.abs(deltas) >= tolerance] + return np.array( + [self.tie_indices[index] for index in indices], dtype=self.tie_indices.dtype + ) @classmethod def from_array(cls, arr, dim=None, tolerance=None): diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 40ac11eb..d7677c99 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -346,76 +346,14 @@ def simplify(self, tolerance=None): self.dim, ) - def get_div_points(self, tolerance=None): - div_points = self.tie_indices[1:] + def get_split_indices(self, tolerance=None): + indices = self.tie_indices[1:] if tolerance is not None: deltas = self.tie_values[1:] - ( self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] ) - div_points = div_points[np.abs(deltas) >= tolerance] - div_points = np.concatenate(([0], div_points, [len(self)])) - return div_points - - def get_discontinuities(self): - if self.empty: - return pd.DataFrame( - columns=[ - "start_index", - "end_index", - "start_value", 
- "end_value", - "delta", - "type", - ] - ) - records = [] - for index in self.tie_indices[:-1]: - start_index = index - end_index = index + 1 - start_value = self.get_value(index) - end_value = self.get_value(index + 1) - record = { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": ("gap" if end_value > start_value else "overlap"), - } - records.append(record) - return pd.DataFrame.from_records(records) - - def get_availabilities(self): - if self.empty: - return pd.DataFrame( - columns=[ - "start_index", - "end_index", - "start_value", - "end_value", - "delta", - "type", - ] - ) - records = [] - for index, value, length in zip( - self.tie_indices, self.tie_values, self.tie_indices - ): - start_index = index - end_index = index + length - 1 - start_value = value - end_value = value + self.sampling_interval * (length - 1) - records.append( - { - "start_index": start_index, - "end_index": end_index, - "start_value": start_value, - "end_value": end_value, - "delta": end_value - start_value, - "type": "data", - } - ) - return pd.DataFrame.from_records(records) + indices = indices[np.abs(deltas) <= tolerance] + return indices @classmethod def from_array(cls, arr, dim=None, sampling_interval=None): diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 08933926..c3a75dcb 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -4,6 +4,7 @@ from collections import defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from glob import glob +from itertools import pairwise import numpy as np import pandas as pd @@ -778,8 +779,9 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None if isinstance(indices_or_sections, str) and ( indices_or_sections == "discontinuities" ): - div_points = da[dim].get_div_points(tolerance) - elif isinstance(indices_or_sections, int): + indices_or_sections = 
da[dim].get_split_indices(tolerance) + + if isinstance(indices_or_sections, int): nsamples = da.sizes[dim] nchunk = indices_or_sections if nchunk <= 0: @@ -790,12 +792,9 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None chunks = extras * [chunk_size + 1] + (nchunk - extras) * [chunk_size] div_points = np.cumsum([0] + chunks, dtype=np.int64) else: - div_points = [0] + indices_or_sections + [da.sizes[dim]] + div_points = np.concatenate([[0], indices_or_sections, [da.sizes[dim]]]) return DataCollection( - [ - da.isel({dim: slice(div_points[idx], div_points[idx + 1])}) - for idx in range(len(div_points) - 1) - ] + [da.isel({dim: slice(start, stop)}) for start, stop in pairwise(div_points)] ) From 44c4b2484fc25372d97623d1d8a085ec085e404c Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 19 Dec 2025 16:36:19 +0100 Subject: [PATCH 047/104] Fix SampledCoordinate tolerance and cast. --- xdas/coordinates/sampled.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index d7677c99..ab89f128 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -113,7 +113,10 @@ def issampled(self): return True def get_sampling_interval(self, cast=True): - return self.sampling_interval + delta = self.sampling_interval + if cast and np.issubdtype(delta.dtype, np.timedelta64): + delta = delta / np.timedelta64(1, "s") + return delta def __len__(self): if self.empty: @@ -328,6 +331,8 @@ def decimate(self, q): return self[::q] def simplify(self, tolerance=None): + if tolerance is None: + tolerance = np.array(0, dtype=self.sampling_interval.dtype) tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): From c07519116f629c190162742c77fcd4306cfc5fe4 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sat, 20 Dec 2025 18:40:56 +0100 Subject: [PATCH 048/104] Add 
Coordinare[type].from_block. --- xdas/coordinates/core.py | 14 ++++++++++++++ xdas/coordinates/default.py | 2 +- xdas/coordinates/dense.py | 7 ++++++- xdas/coordinates/interp.py | 12 +++++++++++- xdas/coordinates/sampled.py | 22 ++++++++++++++++------ 5 files changed, 48 insertions(+), 9 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index a159a3ad..484a8fca 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -250,6 +250,16 @@ def _assign_parent(self, parent): class Coordinate: + _registry = {} + + def __init_subclass__(cls, *, name=None, **kwargs): + super().__init_subclass__(**kwargs) + if name is not None: + Coordinate._registry[name] = cls + + def __class_getitem__(cls, item): + return cls._registry[item] + def __new__(cls, data=None, dim=None, dtype=None): if data is None: raise TypeError("cannot infer coordinate type if no `data` is provided") @@ -580,6 +590,10 @@ def from_dataset(cls, dataset, name): coords |= subcls.from_dataset(dataset, name) return coords + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + raise NotImplementedError + def parse(data, dim=None): if isinstance(data, tuple): diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index 8b11220b..80234791 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -3,7 +3,7 @@ from .core import Coordinate, isscalar, parse -class DefaultCoordinate(Coordinate): +class DefaultCoordinate(Coordinate, name="default"): def __new__(cls, *args, **kwargs): return object.__new__(cls) diff --git a/xdas/coordinates/dense.py b/xdas/coordinates/dense.py index ff4d355c..698bdb24 100644 --- a/xdas/coordinates/dense.py +++ b/xdas/coordinates/dense.py @@ -4,7 +4,7 @@ from .core import Coordinate, parse -class DenseCoordinate(Coordinate): +class DenseCoordinate(Coordinate, name="dense"): def __new__(cls, *args, **kwargs): return object.__new__(cls) @@ -106,3 +106,8 @@ def from_dataset(cls, dataset, 
name): ) for name, coord in dataset[name].coords.items() } + + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + data = start + step * np.arange(size) + return cls(data, dim=dim, dtype=dtype) diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index d7958499..e95124d5 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -7,7 +7,7 @@ from .core import Coordinate, format_datetime, is_strictly_increasing, parse -class InterpCoordinate(Coordinate): +class InterpCoordinate(Coordinate, name="interpolated"): """ Array-like object used to represent piecewise evenly spaced coordinates using the CF convention. @@ -356,6 +356,16 @@ def from_dataset(cls, dataset, name): coords[dim] = Coordinate(data, dim) return coords + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + return cls( + { + "tie_indices": [0, size - 1], + "tie_values": [start, start + step * (size - 1)], + }, + dim=dim, + ) + def douglas_peucker(x, y, epsilon): mask = np.ones(len(x), dtype=bool) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index ab89f128..3fc48762 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -1,12 +1,11 @@ import re import numpy as np -import pandas as pd from .core import Coordinate, format_datetime, is_strictly_increasing, parse -class SampledCoordinate(Coordinate): +class SampledCoordinate(Coordinate, name="sampled"): """ A coordinate that is sampled at regular intervals. 
@@ -142,10 +141,10 @@ def __repr__(self): def __getitem__(self, item): if isinstance(item, slice): return self.slice_index(item) - elif np.isscalar(item): - return Coordinate(self.get_value(item), None) else: - return Coordinate(self.get_value(item), self.dim) + return Coordinate( + self.get_value(item), None if np.isscalar(item) else self.dim + ) def __add__(self, other): return self.__class__( @@ -280,7 +279,9 @@ def get_indexer(self, value, method=None): # Check that value lies within the coordinate value range (vectorized) if np.any(value < self.start) or np.any(value >= self.end): raise KeyError("index not found") - if not is_strictly_increasing(self.tie_values): + if not is_strictly_increasing( + self.tie_values + ): # TODO: make it work even in this case raise ValueError("tie_values must be strictly increasing") reference = np.searchsorted(self.tie_values, value, side="right") - 1 offset = (value - self.tie_values[reference]) / self.sampling_interval @@ -417,3 +418,12 @@ def from_dataset(cls, dataset, name): } coords[dim] = Coordinate(data, dim) return coords + + @classmethod + def from_block(cls, start, size, step, dim=None, dtype=None): + data = { + "tie_values": [start], + "tie_lengths": [size], + "sampling_interval": step, + } + return cls(data, dim=dim, dtype=dtype) From 902dc662c1789702deb4827c49c227e5a17d52ce Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Sat, 20 Dec 2025 18:57:25 +0100 Subject: [PATCH 049/104] WIP: add `ctype` kwargs to read functions for Coordinate type selection. 
--- xdas/io/apsensing.py | 9 +++++---- xdas/io/asn.py | 8 ++++---- xdas/io/febus.py | 20 +++++++++++-------- xdas/io/miniseed.py | 46 ++++++++++++++++---------------------------- xdas/io/optasense.py | 10 +++++++--- xdas/io/silixa.py | 14 ++++++-------- xdas/io/sintela.py | 10 +++++++--- xdas/io/terra15.py | 9 +++++---- 8 files changed, 63 insertions(+), 63 deletions(-) diff --git a/xdas/io/apsensing.py b/xdas/io/apsensing.py index be03a7f1..937b165e 100644 --- a/xdas/io/apsensing.py +++ b/xdas/io/apsensing.py @@ -1,11 +1,12 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource -def read(fname): +def read(fname, ctype="interpolated"): with h5py.File(fname, "r") as file: t0 = file["Metadata"]["Timestamp"][()].item().decode() fs = file["DAQ"]["RepetitionFrequency"][()].item() @@ -18,6 +19,6 @@ def read(fname): t0 = np.datetime64(t0) dt = np.timedelta64(round(1e9 / fs), "ns") nt, nd = data.shape - t = {"tie_indices": [0, nt - 1], "tie_values": [t0, t0 + (nt - 1) * dt]} - d = {"tie_indices": [0, nd - 1], "tie_values": [0.0, (nd - 1) * dx]} - return DataArray(data, {"time": t, "distance": d}) + time = Coordinate[ctype].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype].from_block(0.0, nd, dx, dim="distance") + return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 44da7a5d..3e935364 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -4,12 +4,12 @@ import numpy as np import zmq -from ..coordinates.core import get_sampling_interval +from ..coordinates.core import Coordinate, get_sampling_interval from ..core.dataarray import DataArray from ..virtual import VirtualSource -def read(fname): +def read(fname, ctype="interpolated"): with h5py.File(fname, "r") as file: header = file["header"] t0 = np.datetime64(round(header["time"][()] * 1e9), "ns") @@ -17,8 +17,8 @@ def read(fname): dx = header["dx"][()] * 
np.median(np.diff(header["channels"])) data = VirtualSource(file["data"]) nt, nx = data.shape - time = {"tie_indices": [0, nt - 1], "tie_values": [t0, t0 + (nt - 1) * dt]} - distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]} + time = Coordinate[ctype].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype].from_block(0.0, nx, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/febus.py b/xdas/io/febus.py index 3488352d..747b174e 100644 --- a/xdas/io/febus.py +++ b/xdas/io/febus.py @@ -3,12 +3,13 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..core.routines import concatenate from ..virtual import VirtualSource -def read(fname, overlaps=None, offset=None): +def read(fname, overlaps=None, offset=None, ctype="interpolated"): """ Open a Febus file into a xdas DataArray object. @@ -87,16 +88,19 @@ def read(fname, overlaps=None, offset=None): dt, dx = delta _, nt, nx = chunks.shape + dt = np.rint(1e6 * dt).astype("m8[us]").astype("m8[ns]") + dc = [] for t0, chunk in zip(times, chunks): - time = { - "tie_indices": [0, nt - 1], - "tie_values": np.rint(1e6 * np.array([t0, t0 + (nt - 1) * dt])) - .astype("M8[us]") - .astype("M8[ns]"), - } - distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]} + + t0 = np.rint(1e6 * t0).astype("M8[us]").astype("M8[ns]") + time = Coordinate[ctype].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype].from_block(0.0, nx, dx, dim="distance") da = DataArray(chunk, {"time": time, "distance": distance}, name=name) dc.append(da) return concatenate(dc, "time") + + +def _to_datetime64(data): + return diff --git a/xdas/io/miniseed.py b/xdas/io/miniseed.py index 6a2d4691..375e25dc 100644 --- a/xdas/io/miniseed.py +++ b/xdas/io/miniseed.py @@ -6,15 +6,15 @@ from ..core.dataarray import DataArray -def read(fname, ignore_last_sample=False): - shape, dtype, coords, 
method = read_header(fname, ignore_last_sample) +def read(fname, ignore_last_sample=False, ctype="interpolated"): + shape, dtype, coords, method = read_header(fname, ignore_last_sample, ctype) data = dask.array.from_delayed( dask.delayed(read_data)(fname, method, ignore_last_sample), shape, dtype ) return DataArray(data, coords) -def read_header(path, ignore_last_sample): +def read_header(path, ignore_last_sample, ctype): st = obspy.read(path, headonly=True) dtype = uniquifiy(tr.data.dtype for tr in st) @@ -33,16 +33,20 @@ def read_header(path, ignore_last_sample): tmp_st = st.select(channel=channels[0]) for n, tr in enumerate(tmp_st): if n == 0: - time = get_time_coord(tr, ignore_last_sample=False) + time = get_time_coord(tr, ignore_last_sample=False, ctype=ctype) elif n == len(tmp_st) - 1: - time = time.append(get_time_coord(tr, ignore_last_sample)) + time = time.append(get_time_coord(tr, ignore_last_sample, ctype=ctype)) else: - time = time.append(get_time_coord(tr, ignore_last_sample=False)) + time = time.append( + get_time_coord(tr, ignore_last_sample=False, ctype=ctype) + ) else: method = "synchronized" - time = get_time_coord(st[0], ignore_last_sample) + time = get_time_coord(st[0], ignore_last_sample, ctype) - if not all(get_time_coord(tr, ignore_last_sample).equals(time) for tr in st): + if not all( + get_time_coord(tr, ignore_last_sample, ctype).equals(time) for tr in st + ): raise ValueError("All traces must be synchronized") network = uniquifiy(tr.stats.network for tr in st) @@ -85,27 +89,11 @@ def read_data(path, method, ignore_last_sample): return np.array(data) -def get_time_coord(tr, ignore_last_sample): - if ignore_last_sample: - return Coordinate( - { - "tie_indices": [0, tr.stats.npts - 2], - "tie_values": [ - np.datetime64(tr.stats.starttime), - np.datetime64(tr.stats.endtime - tr.stats.delta), - ], - } - ) - else: - return Coordinate( - { - "tie_indices": [0, tr.stats.npts - 1], - "tie_values": [ - np.datetime64(tr.stats.starttime), - 
np.datetime64(tr.stats.endtime), - ], - } - ) +def get_time_coord(tr, ignore_last_sample, ctype): + t0 = np.datetime64(tr.stats.starttime) + dt = np.rint(1e6 * tr.stats.delta).astype("m8[us]").astype("m8[ns]") + nt = tr.stats.npts - int(ignore_last_sample) + return Coordinate[ctype].from_block(t0, nt, dt, dim="time") def uniquifiy(seq): diff --git a/xdas/io/optasense.py b/xdas/io/optasense.py index 97d92084..16e175cf 100644 --- a/xdas/io/optasense.py +++ b/xdas/io/optasense.py @@ -1,11 +1,12 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource -def read(fname): +def read(fname, ctype="interpolated"): with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] @@ -14,6 +15,9 @@ def read(fname): tend = np.datetime64(rawdata.attrs["PartEndTime"][:-1]) data = VirtualSource(rawdata) nd, nt = data.shape - time = {"tie_indices": [0, nt - 1], "tie_values": [tstart, tend]} - distance = {"tie_indices": [0, nd - 1], "tie_values": [0.0, (nd - 1) * dx]} + time = { + "tie_indices": [0, nt - 1], + "tie_values": [tstart, tend], + } # TODO: use from_block + distance = Coordinate[ctype].from_block(0.0, nd, dx, dim="distance") return DataArray(data, {"distance": distance, "time": time}) diff --git a/xdas/io/silixa.py b/xdas/io/silixa.py index f7955834..ae97867e 100644 --- a/xdas/io/silixa.py +++ b/xdas/io/silixa.py @@ -1,31 +1,29 @@ import dask import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from .tdms import TdmsReader -def read(fname): - shape, dtype, coords = read_header(fname) +def read(fname, ctype="interpolated"): + shape, dtype, coords = read_header(fname, ctype) data = dask.array.from_delayed(dask.delayed(read_data)(fname), shape, dtype) return DataArray(data, coords) -def read_header(fname): +def read_header(fname, ctype): with TdmsReader(fname) as tdms: props = 
tdms.get_properties() shape = tdms.channel_length, tdms.fileinfo["n_channels"] dtype = tdms._data_type t0 = np.datetime64(props["GPSTimeStamp"]) dt = np.timedelta64(round(1e9 / props["SamplingFrequency[Hz]"]), "ns") - time = { - "tie_indices": [0, shape[0] - 1], - "tie_values": [t0, t0 + dt * (shape[0] - 1)], - } + time = Coordinate[ctype].from_block(t0, shape[0], dt, dim="time") distance = { "tie_indices": [0, shape[1] - 1], "tie_values": [props["Start Distance (m)"], props["Stop Distance (m)"]], - } + } # TODO: use from_block coords = {"time": time, "distance": distance} return shape, dtype, coords diff --git a/xdas/io/sintela.py b/xdas/io/sintela.py index ba6fc865..605f50fd 100644 --- a/xdas/io/sintela.py +++ b/xdas/io/sintela.py @@ -1,11 +1,12 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource -def read(fname): +def read(fname, ctype="interpolated"): with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] @@ -14,6 +15,9 @@ def read(fname): tend = np.datetime64(rawdata.attrs["PartEndTime"].decode().split("+")[0]) data = VirtualSource(rawdata) nt, nd = data.shape - time = {"tie_indices": [0, nt - 1], "tie_values": [tstart, tend]} - distance = {"tie_indices": [0, nd - 1], "tie_values": [0.0, (nd - 1) * dx]} + time = { + "tie_indices": [0, nt - 1], + "tie_values": [tstart, tend], + } # TODO: use from_block + distance = Coordinate[ctype].from_block(0.0, nd, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/terra15.py b/xdas/io/terra15.py index 7c969acf..e6cf2426 100644 --- a/xdas/io/terra15.py +++ b/xdas/io/terra15.py @@ -3,11 +3,12 @@ import h5py import numpy as np +from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource -def read(fname, tz=timezone.utc): +def read(fname, tz=timezone.utc, 
ctype="interpolated"): with h5py.File(fname, "r") as file: ti = np.datetime64( datetime.fromtimestamp(file["data_product"]["gps_time"][0], tz=tz) @@ -19,6 +20,6 @@ def read(fname, tz=timezone.utc): dx = file.attrs["dx"] data = VirtualSource(file["data_product"]["data"]) nt, nd = data.shape - t = {"tie_indices": [0, nt - 1], "tie_values": [ti, tf]} - d = {"tie_indices": [0, nd - 1], "tie_values": [d0, d0 + (nd - 1) * dx]} - return DataArray(data, {"time": t, "distance": d}) + time = {"tie_indices": [0, nt - 1], "tie_values": [ti, tf]} # TODO: use from_block + ctype = Coordinate[ctype].from_block(d0, nd, dx, dim="distance") + return DataArray(data, {"time": time, "distance": ctype}) From c9b3358445bcbf29a0b0de1ab531c2e21860604f Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 24 Dec 2025 17:23:19 +0100 Subject: [PATCH 050/104] Fixing stepped slicing in SampledCoordinate and add tests. --- tests/coordinates/test_sampled.py | 46 +++++++++++++++++++++++++++++-- xdas/coordinates/core.py | 2 +- xdas/coordinates/sampled.py | 42 ++++++++++++++-------------- 3 files changed, 65 insertions(+), 25 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 14ea457d..26a24821 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -109,14 +109,26 @@ def test_get_value_scalar_and_vector(self): assert coord.get_value(2) == 2.0 assert coord.get_value(3) == 10.0 assert coord.get_value(4) == 11.0 + # negative index + assert coord.get_value(-1) == 11.0 + assert coord.get_value(-2) == 10.0 + assert coord.get_value(-3) == 2.0 + assert coord.get_value(-4) == 1.0 + assert coord.get_value(-5) == 0.0 # vectorized - vals = coord.get_value([0, 2, 3, 4]) - assert np.array_equal(vals, np.array([0.0, 2.0, 10.0, 11.0])) + vals = coord.get_value([0, 1, 2, 3, 4, -5, -4, -3, -2, -1]) + assert np.array_equal( + vals, np.array([0.0, 1.0, 2.0, 10.0, 11.0, 0.0, 1.0, 2.0, 10.0, 11.0]) + ) # bounds with 
pytest.raises(IndexError): coord.get_value(-6) with pytest.raises(IndexError): coord.get_value(5) + with pytest.raises(IndexError): + coord.get_value([0, 5]) + with pytest.raises(IndexError): + coord.get_value([-6, 0]) def test_getitem(self): coord = self.make_coord() @@ -127,6 +139,36 @@ def test_getitem(self): # slice -> SampledCoordinate or compatible sub = coord[1:4] assert isinstance(sub, SampledCoordinate) + assert np.array_equal(sub.values, np.array([1.0, 2.0, 10.0])) + # slice negative + sub_neg = coord[-4:-1] + assert isinstance(sub_neg, SampledCoordinate) + assert np.array_equal(sub_neg.values, np.array([1.0, 2.0, 10.0])) + # full slice + full = coord[:] + assert full.equals(coord) + # None bound indexing + none_start = coord[None:3] + assert isinstance(none_start, SampledCoordinate) + assert np.array_equal(none_start.values, np.array([0.0, 1.0, 2.0])) + none_end = coord[2:None] + assert isinstance(none_end, SampledCoordinate) + assert np.array_equal(none_end.values, np.array([2.0, 10.0, 11.0])) + # step slice -> SampledCoordinate + step = coord[::2] + assert isinstance(step, SampledCoordinate) + assert np.array_equal(step.values, np.array([0.0, 2.0, 11.0])) + # step slice with start/stop + step_ss = coord[1:5:2] + assert isinstance(step_ss, SampledCoordinate) + assert np.array_equal(step_ss.values, np.array([1.0, 10.0])) + # negative step slice with start/stop + step_ss_neg = coord[-4:-1:2] + assert isinstance(step_ss_neg, SampledCoordinate) + assert np.array_equal(step_ss_neg.values, np.array([1.0, 10.0])) + # negative step slice -> raise NotImplementedError + with pytest.raises(NotImplementedError): + coord[::-1] # array -> DenseCoordinate of values arr = coord[[0, 4]] assert isinstance(arr, DenseCoordinate) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 484a8fca..eb107c0d 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -370,7 +370,7 @@ def format_index(self, idx, bounds="raise"): idx = np.clip(idx, 0, 
len(self)) return idx - def format_index_slice(self, slc): + def format_index_slice(self, slc): # TODO: use slice.indices instead start = slc.start stop = slc.stop step = slc.step diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 3fc48762..82284b1e 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -235,33 +235,31 @@ def get_value(self, index): ) def slice_index(self, index_slice): - index_slice = self.format_index_slice(index_slice) + # normalize slice + start, stop, step = index_slice.indices(len(self)) - # TODO: optimize when start and/or stop are None + if step < 0: + raise NotImplementedError("negative slice step is not implemented") - # get indices relative to tie points - relative_start_index = np.clip( - index_slice.start - self.tie_indices, 0, self.tie_lengths - ) - relative_stop_index = np.clip( - index_slice.stop - self.tie_indices, 0, self.tie_lengths - ) + # align stop + stop += (start - stop) % step # TODO: check for negative step - # keep segments with data - mask = relative_start_index < relative_stop_index + # get relative start and stop within each tie + q, r = np.divmod(start - self.tie_indices, step) + lo = np.maximum(q, 0) * step + r - # compute new tie points ane lengths - tie_values = ( - self.tie_values[mask] + relative_start_index[mask] * self.sampling_interval - ) - tie_lengths = relative_stop_index[mask] - relative_start_index[mask] + q, r = np.divmod(self.tie_indices + self.tie_lengths - stop, step) + hi = self.tie_lengths - np.maximum(q, 0) * step + r - # adjust for step if needed - if index_slice.step == 1: - sampling_interval = self.sampling_interval - else: - tie_lengths = (self.tie_lengths + index_slice.step - 1) // index_slice.step - sampling_interval = self.sampling_interval * index_slice.step + # filter empty segments + mask = hi > lo + lo = lo[mask] + hi = hi[mask] + + # compute new tie values, tie lengths and sampling interval + tie_values = self.tie_values[mask] + lo * 
self.sampling_interval + tie_lengths = (hi - lo) // step + sampling_interval = self.sampling_interval * step # build new coordinate data = { From be45e77cd58033746afc70873b529a0b5d941ea8 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 24 Dec 2025 17:34:37 +0100 Subject: [PATCH 051/104] Remove unused format_index_slice method and related test. Use slice.indices instead. --- docs/api/xdas.md | 1 - tests/coordinates/test_interp.py | 4 ---- tests/coordinates/test_sampled.py | 1 + xdas/coordinates/core.py | 18 ------------------ xdas/coordinates/interp.py | 9 +++------ xdas/coordinates/sampled.py | 4 +--- 6 files changed, 5 insertions(+), 32 deletions(-) diff --git a/docs/api/xdas.md b/docs/api/xdas.md index 3fcfd50f..118b079b 100644 --- a/docs/api/xdas.md +++ b/docs/api/xdas.md @@ -318,7 +318,6 @@ Methods InterpCoordinate.get_value InterpCoordinate.format_index InterpCoordinate.slice_index - InterpCoordinate.format_index_slice InterpCoordinate.get_indexer InterpCoordinate.slice_indexer InterpCoordinate.decimate diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py index 95b1d77c..f9ef27a1 100644 --- a/tests/coordinates/test_interp.py +++ b/tests/coordinates/test_interp.py @@ -147,10 +147,6 @@ def test_format_index(self): # TODO pass - def test_format_index_slice(self): - # TODO - pass - def test_get_value(self): coord = InterpCoordinate({"tie_indices": [0, 8], "tie_values": [100.0, 900.0]}) assert coord.get_value(0) == 100.0 diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 26a24821..9ee05ddd 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -191,6 +191,7 @@ def test_slice_negative_and_out_of_bounds(self): # negative slice indices s = coord[-4:-1] assert isinstance(s, SampledCoordinate) + assert np.array_equal(s.values, np.array([1.0, 2.0, 10.0])) # slice that extends beyond bounds should clip s2 = coord[-10:10] assert s2.equals(coord) diff --git 
a/xdas/coordinates/core.py b/xdas/coordinates/core.py index eb107c0d..6ee921b4 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -370,24 +370,6 @@ def format_index(self, idx, bounds="raise"): idx = np.clip(idx, 0, len(self)) return idx - def format_index_slice(self, slc): # TODO: use slice.indices instead - start = slc.start - stop = slc.stop - step = slc.step - if start is None: - start = 0 - if stop is None: - stop = len(self) - if step is None: - step = 1 - if step <= 0: - raise NotImplementedError( - "negative or zero step when slicing is not supported yet" - ) - start = self.format_index(start, bounds="clip") - stop = self.format_index(stop, bounds="clip") - return slice(start, stop, step) - def slice_indexer(self, start=None, stop=None, step=None, endpoint=True): if start is not None: try: diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index e95124d5..661c4d09 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -190,12 +190,9 @@ def get_value(self, index): return forward(index, self.tie_indices, self.tie_values) def slice_index(self, index_slice): - index_slice = self.format_index_slice(index_slice) - start_index, stop_index, step_index = ( - index_slice.start, - index_slice.stop, - index_slice.step, - ) + start_index, stop_index, step_index = index_slice.indices(len(self)) + if step_index < 0: + raise NotImplementedError("negative slice step is not implemented") if stop_index - start_index <= 0: return self.__class__(dict(tie_indices=[], tie_values=[]), dim=self.dim) elif (stop_index - start_index) <= step_index: diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 82284b1e..8945755d 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -226,9 +226,7 @@ def equals(self, other): ) def get_value(self, index): - index = self.format_index(index) - if np.any(index < 0) or np.any(index >= len(self)): - raise IndexError("index is out of bounds") + 
index = self.format_index(index, bounds="raise") reference = np.searchsorted(self.tie_indices, index, side="right") - 1 return self.tie_values[reference] + ( (index - self.tie_indices[reference]) * self.sampling_interval From 6f9716d0c88c02faa607e3e7c379f3b6bb1eb397 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 7 Jan 2026 14:49:35 +0100 Subject: [PATCH 052/104] Refactor coordinates method order. --- xdas/coordinates/default.py | 59 ++++++++++++++---------- xdas/coordinates/dense.py | 13 ++++-- xdas/coordinates/interp.py | 89 +++++++++++++++++++----------------- xdas/coordinates/sampled.py | 90 ++++++++++++++++++------------------- xdas/coordinates/scalar.py | 6 +-- 5 files changed, 142 insertions(+), 115 deletions(-) diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index 80234791..df24191a 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -8,16 +8,47 @@ def __new__(cls, *args, **kwargs): return object.__new__(cls) def __init__(self, data=None, dim=None, dtype=None): + # empty if data is None: data = {"size": 0} + + # parse data data, dim = parse(data, dim) if not self.isvalid(data): raise TypeError("`data` must be a mapping {'size': }") + + # check dtype if dtype is not None: raise ValueError("`dtype` is not supported for DefaultCoordinate") + + # store data self.data = data self.dim = dim + @property + def empty(self): + return bool(self.data["size"]) + + @property + def dtype(self): + return np.int64 + + @property + def ndim(self): + return 1 + + @property + def shape(self): + return (len(self),) + + @staticmethod + def isvalid(data): + match data: + case {"size": None | int(_)}: + return True + case _: + return False + def __len__(self): if self.data["size"] is None: return 0 @@ -32,33 +63,15 @@ def __getitem__(self, item): def __array__(self, dtype=None): return np.arange(self.data["size"], dtype=dtype) - @staticmethod - def isvalid(data): - match data: - case {"size": None | int(_)}: - return 
True - case _: - return False + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + raise NotImplementedError + + def __array_function__(self, func, types, args, kwargs): + raise NotImplementedError def isdefault(self): return True - @property - def empty(self): - return bool(self.data["size"]) - - @property - def dtype(self): - return np.int64 - - @property - def ndim(self): - return 1 - - @property - def shape(self): - return (len(self),) - def get_sampling_interval(self, cast=True): return 1 diff --git a/xdas/coordinates/dense.py b/xdas/coordinates/dense.py index 698bdb24..ff8c5357 100644 --- a/xdas/coordinates/dense.py +++ b/xdas/coordinates/dense.py @@ -9,14 +9,23 @@ def __new__(cls, *args, **kwargs): return object.__new__(cls) def __init__(self, data=None, dim=None, dtype=None): + # empty if data is None: data = [] + + # parse data data, dim = parse(data, dim) if not self.isvalid(data): raise TypeError("`data` must be array-like") + + # store data self.data = np.asarray(data, dtype=dtype) self.dim = dim + @property + def index(self): + return pd.Index(self.data) + @staticmethod def isvalid(data): data = np.asarray(data) @@ -25,10 +34,6 @@ def isvalid(data): def isdense(self): return True - @property - def index(self): - return pd.Index(self.data) - def equals(self, other): if isinstance(other, self.__class__): return ( diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 661c4d09..741e4b43 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -29,8 +29,11 @@ def __new__(cls, *args, **kwargs): return object.__new__(cls) def __init__(self, data=None, dim=None, dtype=None): + # empty if data is None: data = {"tie_indices": [], "tie_values": []} + + # parse data data, dim = parse(data, dim) if not self.__class__.isvalid(data): raise TypeError("`data` must be dict-like") @@ -40,12 +43,16 @@ def __init__(self, data=None, dim=None, dtype=None): ) tie_indices = np.asarray(data["tie_indices"]) tie_values = 
np.asarray(data["tie_values"], dtype=dtype) + + # check shapes if not tie_indices.ndim == 1: raise ValueError("`tie_indices` must be 1D") if not tie_values.ndim == 1: raise ValueError("`tie_values` must be 1D") if not len(tie_indices) == len(tie_values): raise ValueError("`tie_indices` and `tie_values` must have the same length") + + # check dtypes if not tie_indices.shape == (0,): if not np.issubdtype(tie_indices.dtype, np.integer): raise ValueError("`tie_indices` must be integer-like") @@ -58,10 +65,50 @@ def __init__(self, data=None, dim=None, dtype=None): or np.issubdtype(tie_values.dtype, np.datetime64) ): raise ValueError("`tie_values` must have either numeric or datetime dtype") + + # store data tie_indices = tie_indices.astype(int) self.data = dict(tie_indices=tie_indices, tie_values=tie_values) self.dim = dim + @property + def tie_indices(self): + return self.data["tie_indices"] + + @property + def tie_values(self): + return self.data["tie_values"] + + @property + def dtype(self): + return self.tie_values.dtype + + @property + def empty(self): + return self.tie_indices.shape == (0,) + + @property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(self.tie_indices[-1] + 1) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + @staticmethod def isvalid(data): match data: @@ -70,9 +117,6 @@ def isvalid(data): case _: return False - def isinterp(self): - return True - def __len__(self): if self.empty: return 0 @@ -126,43 +170,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __array_function__(self, func, types, args, kwargs): raise NotImplementedError - @property - def tie_indices(self): - return self.data["tie_indices"] - - @property - def tie_values(self): - return self.data["tie_values"] - 
- @property - def empty(self): - return self.tie_indices.shape == (0,) - - @property - def dtype(self): - return self.tie_values.dtype - - @property - def ndim(self): - return self.tie_values.ndim - - @property - def shape(self): - return (len(self),) - - @property - def indices(self): - if self.empty: - return np.array([], dtype="int") - else: - return np.arange(self.tie_indices[-1] + 1) - - @property - def values(self): - if self.empty: - return np.array([], dtype=self.dtype) - else: - return self.get_value(self.indices) + def isinterp(self): + return True def get_sampling_interval(self, cast=True): if len(self) < 2: diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 8945755d..05d32535 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -96,6 +96,44 @@ def sampling_interval(self): def dtype(self): return self.tie_values.dtype + @property + def tie_indices(self): + return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) + + @property + def empty(self): + return self.tie_values.shape == (0,) + + @property + def ndim(self): + return self.tie_values.ndim + + @property + def shape(self): + return (len(self),) + + @property + def indices(self): + if self.empty: + return np.array([], dtype="int") + else: + return np.arange(len(self)) + + @property + def values(self): + if self.empty: + return np.array([], dtype=self.dtype) + else: + return self.get_value(self.indices) + + @property + def start(self): + return self.tie_values[0] + + @property + def end(self): + return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] + @staticmethod def isvalid(data): match data: @@ -108,15 +146,6 @@ def isvalid(data): case _: return False - def issampled(self): - return True - - def get_sampling_interval(self, cast=True): - delta = self.sampling_interval - if cast and np.issubdtype(delta.dtype, np.timedelta64): - delta = delta / np.timedelta64(1, "s") - return delta - def __len__(self): if self.empty: return 0 
@@ -178,43 +207,14 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): def __array_function__(self, func, types, args, kwargs): raise NotImplementedError - @property - def tie_indices(self): - return np.concatenate(([0], np.cumsum(self.tie_lengths[:-1]))) - - @property - def empty(self): - return self.tie_values.shape == (0,) - - @property - def ndim(self): - return self.tie_values.ndim - - @property - def shape(self): - return (len(self),) - - @property - def indices(self): - if self.empty: - return np.array([], dtype="int") - else: - return np.arange(len(self)) - - @property - def values(self): - if self.empty: - return np.array([], dtype=self.dtype) - else: - return self.get_value(self.indices) - - @property - def start(self): - return self.tie_values[0] + def issampled(self): + return True - @property - def end(self): - return self.tie_values[-1] + self.sampling_interval * self.tie_lengths[-1] + def get_sampling_interval(self, cast=True): + delta = self.sampling_interval + if cast and np.issubdtype(delta.dtype, np.timedelta64): + delta = delta / np.timedelta64(1, "s") + return delta def equals(self, other): return ( diff --git a/xdas/coordinates/scalar.py b/xdas/coordinates/scalar.py index f0cda938..8a97da47 100644 --- a/xdas/coordinates/scalar.py +++ b/xdas/coordinates/scalar.py @@ -26,9 +26,6 @@ def dim(self, value): if value is not None: raise ValueError("A scalar coordinate cannot have a `dim` other that None") - def get_sampling_interval(self, cast=True): - return None - @staticmethod def isvalid(data): data = np.asarray(data) @@ -37,6 +34,9 @@ def isvalid(data): def isscalar(self): return True + def get_sampling_interval(self, cast=True): + return None + def equals(self, other): if isinstance(other, self.__class__): return self.data == other.data From 5d733d460a56b4f5c338895bdfc17a1019d300a0 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 7 Jan 2026 16:55:39 +0100 Subject: [PATCH 053/104] Fix and test get_indexer. 
--- tests/coordinates/test_sampled.py | 27 +++++++++++++++++++++ xdas/coordinates/sampled.py | 40 +++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 4 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 9ee05ddd..40042208 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -206,6 +206,33 @@ def test_slice_step_decimate(self): assert decimated.equals(stepped) +class TestSampledCoordinateValueBasedIndexing: + def make_coord(self): + return SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) # two segments: [0, 1, 2] and [10, 11] + + def test_get_indexer_exact(self): + coord = self.make_coord() + assert coord.get_indexer(0.0, method=None) == 0 + assert coord.get_indexer(10.0, method=None) == 3 + with pytest.raises(KeyError): + coord.get_indexer(1.5, method=None) + with pytest.raises(KeyError): + coord.get_indexer(5.0, method=None) + + def test_get_indexer_nearest(self): + coord = self.make_coord() + vals = [0.4, 0.6, 10.4, 10.6, -10.0, 20.0, 6.4, 6.6, 6.5] + expected = [0, 1, 3, 4, 0, 4, 2, 3, 3] + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="nearest") + print(f"Value: {v}, Index: {idx}, Expected: {e}") + assert idx == e + idxs = coord.get_indexer(vals, method="nearest") + assert np.array_equal(idxs, np.array(expected)) + + class TestSampledCoordinateAppendErrors: def test_append_sampling_interval_mismatch(self): coord1 = SampledCoordinate( diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 05d32535..7f4baa61 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -272,26 +272,58 @@ def get_indexer(self, value, method=None): value = np.datetime64(value) else: value = np.asarray(value) - # Check that value lies within the coordinate value range (vectorized) - if np.any(value < self.start) or np.any(value >= self.end): - raise KeyError("index not 
found") if not is_strictly_increasing( self.tie_values ): # TODO: make it work even in this case raise ValueError("tie_values must be strictly increasing") + + # find preceeding tie point reference = np.searchsorted(self.tie_values, value, side="right") - 1 + reference = np.maximum(reference, 0) + + # overlaps + before = np.maximum(reference - 1, 0) + ends = ( + self.tie_values[before] + self.tie_lengths[before] * self.sampling_interval + ) + if np.any((reference > 0) & (value < ends)): + raise KeyError("value is in an overlap region") + + # gap + after = np.minimum(reference + 1, len(self.tie_values) - 1) + ends = ( + self.tie_values[reference] + + self.tie_lengths[reference] * self.sampling_interval + ) + mask = (reference < len(self.tie_values) - 1) & ( + value - ends >= self.tie_values[after] - value + ) + reference = np.where(mask, after, reference) + offset = (value - self.tie_values[reference]) / self.sampling_interval + match method: case None: - if np.any(offset % 1 != 0): + if np.any( + (offset % 1 != 0) + | (offset < 0) + | (offset >= self.tie_lengths[reference]) + ): raise KeyError("index not found") offset = offset.astype(int) case "nearest": offset = np.round(offset).astype(int) + offset = np.clip(offset, 0, self.tie_lengths[reference] - 1) case "ffill": offset = np.floor(offset).astype(int) + if np.any(offset > self.tie_lengths[reference] - 1): + raise KeyError("index not found") + offset = np.maximum(offset, 0) case "bfill": offset = np.ceil(offset).astype(int) + if np.any(offset < 0): + raise KeyError("index not found") + offset = np.minimum(offset, self.tie_lengths[reference] - 1) case _: raise ValueError( "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" From 98215a405bef7be77aad5f7178ef63cdedc6bffd Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 8 Jan 2026 11:17:55 +0100 Subject: [PATCH 054/104] Fix & Test SampledCoordinate.get_indexer. 
--- tests/coordinates/test_sampled.py | 36 ++++++++++++++++++++++++++--- xdas/coordinates/sampled.py | 38 ++++++++++++++++++++----------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 40042208..c90dcb3e 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -223,15 +223,45 @@ def test_get_indexer_exact(self): def test_get_indexer_nearest(self): coord = self.make_coord() - vals = [0.4, 0.6, 10.4, 10.6, -10.0, 20.0, 6.4, 6.6, 6.5] - expected = [0, 1, 3, 4, 0, 4, 2, 3, 3] + vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, -10.0, 20.0, 5.9, 6.0, 6.1] + expected = [0, 0, 1, 1, 3, 4, 0, 4, 2, 3, 3] for v, e in zip(vals, expected): idx = coord.get_indexer(v, method="nearest") - print(f"Value: {v}, Index: {idx}, Expected: {e}") assert idx == e + # vectorized idxs = coord.get_indexer(vals, method="nearest") assert np.array_equal(idxs, np.array(expected)) + def test_get_indexer_ffill(self): + coord = self.make_coord() + vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, 20.0, 5.9, 6.0, 6.1] + expected = [0, 0, 0, 1, 3, 3, 4, 2, 2, 2] + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="ffill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(-10.0, method="ffill") + # vectorized + idxs = coord.get_indexer(vals, method="ffill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([-10.0, 0.0], method="ffill") + + def test_get_indexer_bfill(self): + coord = self.make_coord() + vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, -10.0, 5.9, 6.0, 6.1] + expected = [0, 1, 1, 1, 4, 4, 0, 3, 3, 3] + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="bfill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(20.0, method="bfill") + # vectorized + idxs = coord.get_indexer(vals, method="bfill") + assert np.array_equal(idxs, np.array(expected)) + with 
pytest.raises(KeyError): + coord.get_indexer([11.0, 20.0], method="bfill") + class TestSampledCoordinateAppendErrors: def test_append_sampling_interval_mismatch(self): diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 7f4baa61..e55df06f 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -283,22 +283,34 @@ def get_indexer(self, value, method=None): # overlaps before = np.maximum(reference - 1, 0) - ends = ( - self.tie_values[before] + self.tie_lengths[before] * self.sampling_interval + end = ( + self.tie_values[before] + + (self.tie_lengths[before] - 1) * self.sampling_interval ) - if np.any((reference > 0) & (value < ends)): + if np.any((reference > 0) & (value < end)): raise KeyError("value is in an overlap region") # gap after = np.minimum(reference + 1, len(self.tie_values) - 1) - ends = ( + end = ( self.tie_values[reference] - + self.tie_lengths[reference] * self.sampling_interval + + (self.tie_lengths[reference] - 1) * self.sampling_interval ) - mask = (reference < len(self.tie_values) - 1) & ( - value - ends >= self.tie_values[after] - value - ) - reference = np.where(mask, after, reference) + match method: + case "nearest": + mask = (reference < len(self.tie_values) - 1) & ( + value - end >= self.tie_values[after] - value + ) + reference = np.where(mask, after, reference) + case "bfill": + mask = (reference < len(self.tie_values) - 1) & (value >= end) + reference = np.where(mask, after, reference) + case "ffill" | None: + pass + case _: + raise ValueError( + "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" + ) offset = (value - self.tie_values[reference]) / self.sampling_interval @@ -316,14 +328,14 @@ def get_indexer(self, value, method=None): offset = np.clip(offset, 0, self.tie_lengths[reference] - 1) case "ffill": offset = np.floor(offset).astype(int) - if np.any(offset > self.tie_lengths[reference] - 1): + if np.any(offset < 0): raise KeyError("index not found") - offset = 
np.maximum(offset, 0) + offset = np.minimum(offset, self.tie_lengths[reference] - 1) case "bfill": offset = np.ceil(offset).astype(int) - if np.any(offset < 0): + if np.any(offset > self.tie_lengths[reference] - 1): raise KeyError("index not found") - offset = np.minimum(offset, self.tie_lengths[reference] - 1) + offset = np.maximum(offset, 0) case _: raise ValueError( "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" From 0539b0009443b44b467ab4b5fd83cec3cf24bd6c Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 8 Jan 2026 11:36:03 +0100 Subject: [PATCH 055/104] Add datetime tests to SampledCoordinate.get_indexer. --- tests/coordinates/test_sampled.py | 73 +++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index c90dcb3e..6c770b88 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -212,7 +212,18 @@ def make_coord(self): {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} ) # two segments: [0, 1, 2] and [10, 11] + def make_coord_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + return SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(10, "s")], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + def test_get_indexer_exact(self): + # float coord = self.make_coord() assert coord.get_indexer(0.0, method=None) == 0 assert coord.get_indexer(10.0, method=None) == 3 @@ -221,10 +232,34 @@ def test_get_indexer_exact(self): with pytest.raises(KeyError): coord.get_indexer(5.0, method=None) + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + assert coord.get_indexer(t0, method=None) == 0 + assert coord.get_indexer(t0 + np.timedelta64(10, "s"), method=None) == 3 + with pytest.raises(KeyError): + coord.get_indexer(t0 + np.timedelta64(1500, "ms"), method=None) + with pytest.raises(KeyError): + coord.get_indexer(t0 + 
np.timedelta64(5, "s"), method=None) + def test_get_indexer_nearest(self): + # float coord = self.make_coord() vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, -10.0, 20.0, 5.9, 6.0, 6.1] expected = [0, 0, 1, 1, 3, 4, 0, 4, 2, 3, 3] + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="nearest") + assert idx == e + # vectorized + idxs = coord.get_indexer(vals, method="nearest") + assert np.array_equal(idxs, np.array(expected)) + + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") + # scalar for v, e in zip(vals, expected): idx = coord.get_indexer(v, method="nearest") assert idx == e @@ -233,9 +268,11 @@ def test_get_indexer_nearest(self): assert np.array_equal(idxs, np.array(expected)) def test_get_indexer_ffill(self): + # float coord = self.make_coord() vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, 20.0, 5.9, 6.0, 6.1] expected = [0, 0, 0, 1, 3, 3, 4, 2, 2, 2] + # scalar for v, e in zip(vals, expected): idx = coord.get_indexer(v, method="ffill") assert idx == e @@ -247,10 +284,30 @@ def test_get_indexer_ffill(self): with pytest.raises(KeyError): coord.get_indexer([-10.0, 0.0], method="ffill") + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") + print(vals) + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="ffill") + print(f"v={v}, idx={idx}, expected={e}") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(t0 - np.timedelta64(10, "s"), method="ffill") + # vectorized + idxs = coord.get_indexer(vals, method="ffill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([t0 - np.timedelta64(10, "s"), t0], method="ffill") + def test_get_indexer_bfill(self): + # float coord = self.make_coord() vals = [0.0, 0.4, 0.6, 1.0, 10.4, 10.6, -10.0, 5.9, 6.0, 6.1] expected = [0, 1, 1, 
1, 4, 4, 0, 3, 3, 3] + # scalar for v, e in zip(vals, expected): idx = coord.get_indexer(v, method="bfill") assert idx == e @@ -262,6 +319,22 @@ def test_get_indexer_bfill(self): with pytest.raises(KeyError): coord.get_indexer([11.0, 20.0], method="bfill") + # datetime + coord = self.make_coord_datetime() + t0 = coord[0].values + vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") + # scalar + for v, e in zip(vals, expected): + idx = coord.get_indexer(v, method="bfill") + assert idx == e + with pytest.raises(KeyError): + coord.get_indexer(t0 + np.timedelta64(20, "s"), method="bfill") + # vectorized + idxs = coord.get_indexer(vals, method="bfill") + assert np.array_equal(idxs, np.array(expected)) + with pytest.raises(KeyError): + coord.get_indexer([t0, t0 + np.timedelta64(20, "s")], method="bfill") + class TestSampledCoordinateAppendErrors: def test_append_sampling_interval_mismatch(self): From 88e4e3d223f6ba4af9c39b2c58b85abf0046e6cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Falco?= Date: Tue, 13 Jan 2026 11:59:24 +0100 Subject: [PATCH 056/104] create dir before writing to netcdf --- xdas/core/datacollection.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index 4a95a5be..e9feca3d 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -239,6 +239,8 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): location = "/".join([name, str(key)]) if group is not None: location = "/".join([group, location]) + if not os.path.exists(os.path.dirname(fname)): + os.makedirs(os.path.dirname(fname), exist_ok=True) self[key].to_netcdf( fname, mode="a", From dde1be0cee202c3164f9e49f5f056b9379d1f1d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Falco?= Date: Tue, 13 Jan 2026 15:01:50 +0100 Subject: [PATCH 057/104] Create dirname for dataarray and datacollection, even if dirname is current dir (i.e., empty) --- 
xdas/core/dataarray.py | 4 ++++ xdas/core/datacollection.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 2dd762a6..7a9ae94b 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -3,6 +3,7 @@ import re import warnings from functools import partial +import os import h5netcdf import h5py @@ -904,6 +905,9 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): attrs = {} if self.attrs is None else self.attrs attrs |= {"coordinate_interpolation": mapping} if mapping else attrs name = "__values__" if self.name is None else self.name + if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)): + os.makedirs(os.path.dirname(fname), exist_ok=True) + with h5netcdf.File(fname, mode=mode) as file: if group is not None and group not in file: file.create_group(group) diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index e9feca3d..a2df095f 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -239,7 +239,7 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): location = "/".join([name, str(key)]) if group is not None: location = "/".join([group, location]) - if not os.path.exists(os.path.dirname(fname)): + if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)): os.makedirs(os.path.dirname(fname), exist_ok=True) self[key].to_netcdf( fname, From c21746e3ceac1bbda8ad92518a4d5616417a8b8b Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 15:49:10 +0100 Subject: [PATCH 058/104] WIP write and read sampled coordinates. 
--- tests/coordinates/test_sampled.py | 126 ++++++++++++++++++++++++------ xdas/coordinates/core.py | 2 +- xdas/coordinates/sampled.py | 27 ++++--- 3 files changed, 118 insertions(+), 37 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 6c770b88..125f4a68 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -1,7 +1,10 @@ +import tempfile + import numpy as np import pandas as pd import pytest +import xdas as xd from xdas.coordinates import ( Coordinate, DenseCoordinate, @@ -288,11 +291,9 @@ def test_get_indexer_ffill(self): coord = self.make_coord_datetime() t0 = coord[0].values vals = t0 + np.rint(1000 * np.array(vals)).astype("timedelta64[ms]") - print(vals) # scalar for v, e in zip(vals, expected): idx = coord.get_indexer(v, method="ffill") - print(f"v={v}, idx={idx}, expected={e}") assert idx == e with pytest.raises(KeyError): coord.get_indexer(t0 - np.timedelta64(10, "s"), method="ffill") @@ -336,7 +337,56 @@ def test_get_indexer_bfill(self): coord.get_indexer([t0, t0 + np.timedelta64(20, "s")], method="bfill") -class TestSampledCoordinateAppendErrors: +class TestSampledCoordinateAppend: + def test_append_two_coords(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 1.0} + ) + expected = SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + result = coord1.append(coord2) + assert result.equals(expected) + + def test_append_two_datetime_coords(self): + coord1 = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [3], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + coord2 = SampledCoordinate( + { + "tie_values": [np.datetime64("2000-01-01T00:00:10")], + "tie_lengths": [2], + "sampling_interval": np.timedelta64(1, "s"), + 
} + ) + expected = SampledCoordinate( + { + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-01T00:00:10"), + ], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + result = coord1.append(coord2) + assert result.equals(expected) + + def test_append_empty(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = SampledCoordinate() + assert coord1.append(coord2).equals(coord1) + assert coord2.append(coord1).equals(coord1) + def test_append_sampling_interval_mismatch(self): coord1 = SampledCoordinate( {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} @@ -432,28 +482,6 @@ def test_slice_full(self): assert sliced.equals(coord) -class TestSampledCoordinateAppend: - def test_append_two_coords(self): - coord1 = SampledCoordinate( - {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} - ) - coord2 = SampledCoordinate( - {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 1.0} - ) - result = coord1.append(coord2) - assert len(result) == 5 - assert result.tie_values[0] == 0.0 - assert result.tie_values[1] == 10.0 - - def test_append_empty(self): - coord1 = SampledCoordinate( - {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} - ) - coord2 = SampledCoordinate() - assert coord1.append(coord2).equals(coord1) - assert coord2.append(coord1).equals(coord1) - - class TestSampledCoordinateDecimate: def test_decimate(self): coord = SampledCoordinate( @@ -601,3 +629,51 @@ def test_non_increasing_tie_values_raises(self): ) with pytest.raises(ValueError): coord.get_indexer(2.0) + + +class TestSampledCoordinateToNetCDF: + def make_dataarray(self): + return xd.DataArray( + np.random.rand(20, 30), + { + "time": { + "tie_values": [ + np.datetime64("2000-01-01T00:00:00.000000000"), + np.datetime64("2000-01-01T00:00:10.000000000"), + ], + "tie_lengths": [5, 15], + "sampling_interval": 
np.timedelta64(1_000_000_000, "ns").astype( + "timedelta64[ns]" + ), + }, + "distance": { + "tie_values": [0.0], + "tie_lengths": [30], + "sampling_interval": 1.0, + }, + }, + ) + + def test_to_dataset_and_back(self): + import xarray as xr + + da = self.make_dataarray() + dataset = xr.Dataset() + variable_attrs = {} + + # prepare metadata + for coord in da.coords.values(): + dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs) + + dataset["data"] = xr.DataArray(attrs=variable_attrs) + coords = xd.Coordinates.from_dataset(dataset, "data") + + assert coords.equals(da.coords) + + def test_to_netcdf_and_back(self): + expected = self.make_dataarray() + + with tempfile.NamedTemporaryFile(suffix=".nc", delete=False) as file: + expected.to_netcdf(file.name) + result = xd.open_dataarray(file.name) + assert result.equals(expected) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 6ee921b4..4154467d 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -214,7 +214,7 @@ def from_dict(cls, dct): @classmethod def from_dataset(cls, dataset, name): - return Coordinate.from_dataset(dataset, name) + return cls(Coordinate.from_dataset(dataset, name)) def copy(self, deep=True): if deep: diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index e55df06f..f49b0bae 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -49,8 +49,9 @@ def __init__(self, data=None, dim=None, dtype=None): if not len(tie_values) == len(tie_lengths): raise ValueError("`tie_values` and `tie_lengths` must have the same length") - # check dtypes + # check dtypes and values if not empty: + # tie_values if not ( np.issubdtype(tie_values.dtype, np.number) or np.issubdtype(tie_values.dtype, np.datetime64) @@ -58,10 +59,14 @@ def __init__(self, data=None, dim=None, dtype=None): raise ValueError( "`tie_values` must have either numeric or datetime dtype" ) + + # tie_lengths if not np.issubdtype(tie_lengths.dtype, 
np.integer): raise ValueError("`tie_lengths` must be integer-like") if not np.all(tie_lengths > 0): raise ValueError("`tie_lengths` must be strictly positive integers") + + # sampling_interval if not np.isscalar(sampling_interval): raise ValueError("`sampling_interval` must be a scalar value") if np.issubdtype(tie_values.dtype, np.datetime64): @@ -418,7 +423,9 @@ def to_dict(self): return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} def to_dataset(self, dataset, attrs): - mapping = f"{self.name}: {self.name}_values {self.name}_lengths" + mapping = ( + f"{self.name}: {self.name}_values {self.name}_lengths {self.name}_sampling" + ) if "coordinate_sampling" in attrs: attrs["coordinate_sampling"] += " " + mapping else: @@ -430,12 +437,11 @@ def to_dataset(self, dataset, attrs): ) tie_lengths = self.tie_lengths interp_attrs = { - "sampling_interval": self.sampling_interval, - "tie_points_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", + "tie_point_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", } dataset.update( { - f"{self.name}_sampling": ((), np.nan, interp_attrs), + f"{self.name}_sampling": ((), self.sampling_interval, interp_attrs), f"{self.name}_values": (f"{self.name}_points", tie_values), f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), } @@ -447,14 +453,13 @@ def from_dataset(cls, dataset, name): coords = {} mapping = dataset[name].attrs.pop("coordinate_sampling", None) if mapping is not None: - matches = re.findall(r"(\w+): (\w+) (\w+)", mapping) + matches = re.findall(r"(\w+): (\w+) (\w+) (\w+)", mapping) for match in matches: - dim, values, lengths = match - sampling_interval = ... 
+ dim, values, lengths, sampling = match data = { - "tie_values": dataset[values], - "tie_lengths": dataset[lengths], - "sampling_interval": sampling_interval, + "tie_values": dataset[values].values, + "tie_lengths": dataset[lengths].values, + "sampling_interval": dataset[sampling].values[()], } coords[dim] = Coordinate(data, dim) return coords From 71a14e47e64b53147582092d00aaeb320bb67c38 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 16:14:17 +0100 Subject: [PATCH 059/104] Fix CF conventions for SampledCoordinate. --- xdas/coordinates/sampled.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index f49b0bae..9853a283 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -423,9 +423,7 @@ def to_dict(self): return {"dim": self.dim, "data": data, "dtype": str(self.dtype)} def to_dataset(self, dataset, attrs): - mapping = ( - f"{self.name}: {self.name}_values {self.name}_lengths {self.name}_sampling" - ) + mapping = f"{self.name}: {self.name}_sampling" if "coordinate_sampling" in attrs: attrs["coordinate_sampling"] += " " + mapping else: @@ -437,7 +435,7 @@ def to_dataset(self, dataset, attrs): ) tie_lengths = self.tie_lengths interp_attrs = { - "tie_point_mapping": f"{self.name}_points: {self.name}_values {self.name}_lengths", + "tie_point_mapping": f"{self.dim}: {self.name}_values {self.name}_lengths", } dataset.update( { @@ -453,15 +451,18 @@ def from_dataset(cls, dataset, name): coords = {} mapping = dataset[name].attrs.pop("coordinate_sampling", None) if mapping is not None: - matches = re.findall(r"(\w+): (\w+) (\w+) (\w+)", mapping) + matches = re.findall(r"(\w+): (\w+)", mapping) for match in matches: - dim, values, lengths, sampling = match + name, sampling = match + dim, values, lengths = re.match( + r"(\w+): (\w+) (\w+)", dataset[sampling].attrs["tie_point_mapping"] + ).groups() data = { "tie_values": 
dataset[values].values, "tie_lengths": dataset[lengths].values, "sampling_interval": dataset[sampling].values[()], } - coords[dim] = Coordinate(data, dim) + coords[name] = Coordinate(data, dim) return coords @classmethod From 999c1d2e59f0f3863648bef1f8f12c12679e15ff Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 17:00:03 +0100 Subject: [PATCH 060/104] Improve SampledCoordinate.simlify testing. --- tests/coordinates/test_sampled.py | 43 ++++++++++++++++++++++++++++--- xdas/coordinates/sampled.py | 3 ++- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 125f4a68..c0f02b71 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -502,9 +502,46 @@ def test_simplify_continuous(self): "sampling_interval": 1.0, } ) - simplified = coord.simplify(tolerance=0.1) - # If continuous (end of first == start of second), should merge - assert len(simplified.tie_values) <= 2 + result = coord.simplify() + expected = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [5], "sampling_interval": 1.0} + ) + assert result.equals(expected) + + def test_simplify_with_tolerance(self): + # Two nearly continuous segments should merge with tolerance + coord = SampledCoordinate( + { + "tie_values": [0.0, 3.1], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + } + ) + result = coord.simplify(tolerance=0.2) + expected = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [5], "sampling_interval": 1.0} + ) + assert result.equals(expected) + # more advanced test + coord = SampledCoordinate( + { + "tie_values": 10 * np.arange(100) + np.random.rand(100) * 0.2 - 0.1, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": 1.0, + } + ) + result = coord.simplify(tolerance=0.2) + assert len(result.tie_values) == 1 + # extra test + coord = SampledCoordinate( + { + "tie_values": 10 * np.arange(100) + np.random.rand(100) * 0.2 - 
0.1, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": 1.0, + } + ) + result = coord.simplify(tolerance=0.1) + assert np.all(np.abs(result.values - coord.values) <= 0.1) class TestSampledCoordinateGetIndexer: diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 9853a283..cc02a4ec 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -69,6 +69,7 @@ def __init__(self, data=None, dim=None, dtype=None): # sampling_interval if not np.isscalar(sampling_interval): raise ValueError("`sampling_interval` must be a scalar value") + sampling_interval = np.asarray(sampling_interval)[()] # ensure numpy scalar if np.issubdtype(tie_values.dtype, np.datetime64): if not np.issubdtype( np.asarray(sampling_interval).dtype, np.timedelta64 @@ -378,7 +379,7 @@ def decimate(self, q): def simplify(self, tolerance=None): if tolerance is None: - tolerance = np.array(0, dtype=self.sampling_interval.dtype) + tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): From 210ae4e2f4841f170aaa3219e86f8c506cd4f89f Mon Sep 17 00:00:00 2001 From: Martijn van den Ende Date: Wed, 14 Jan 2026 17:01:04 +0100 Subject: [PATCH 061/104] Implemented ASN ROI logic for distance interpolation --- xdas/io/asn.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 7dc42344..7b41ba4d 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -1,4 +1,5 @@ import json +from bisect import bisect_left import h5py import numpy as np @@ -10,16 +11,45 @@ from ..virtual import VirtualSource -def read(fname): +def read(fname: str) -> DataArray: with h5py.File(fname, "r") as file: header = file["header"] + demod = file["demodSpec"] + t0 = np.datetime64(round(header["time"][()] * 1e9), "ns") dt = np.timedelta64(round(1e9 * 
header["dt"][()]), "ns") - dx = header["dx"][()] * np.median(np.diff(header["channels"])) + dx = float(header["dx"][()]) # Note: dx before (internal) downsampling! data = VirtualSource(file["data"]) - nt, nx = data.shape + + # Get the optical distance for all the recorded channels (after downsampling) + # Note that this vector is not continuous for more than one ROI + all_dists = file["cableSpec"]["sensorDistances"][...] + + # Buffer for the data index at which each ROI starts/stops + dist_tie_inds = [] + # Buffer for the optical distance at which each ROI starts/stops + dist_tie_vals = [] + + # Loop over ROIs, get the start/stop index before downsampling + for n_start, n_end in zip(demod["roiStart"], demod["roiEnd"]): + # Get the index where the ROI starts based on the position in the + # distance vector. This solves the issue of rounding during decimation + i = bisect_left(all_dists, n_start * dx) + # Append the data index and optical distance to the buffers + dist_tie_inds.append(i) + dist_tie_vals.append(float(all_dists[i])) + + # Repeat the procedure for the index/distance at which the ROI ends. + # A "discontinuity" in the interpolation scheme is created in the + # following way: n_roi = [start, stop-1, stop, start, stop-1, stop, ...] + i = bisect_left(all_dists, n_end * dx) + for j in reversed(range(2)): + dist_tie_inds.append(i-j) + dist_tie_vals.append(float(all_dists[i-j])) + + nt = data.shape[0] time = {"tie_indices": [0, nt - 1], "tie_values": [t0, t0 + (nt - 1) * dt]} - distance = {"tie_indices": [0, nx - 1], "tie_values": [0.0, (nx - 1) * dx]} + distance = {"tie_indices": dist_tie_inds, "tie_values": dist_tie_vals} return DataArray(data, {"time": time, "distance": distance}) From 795c251a2fd3aca61ca86f3db774e470533fddf0 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 18:11:30 +0100 Subject: [PATCH 062/104] Acheive 100% test coverage for sampled. 
--- tests/coordinates/test_sampled.py | 190 +++++++++++++++++++++++++++++- xdas/coordinates/sampled.py | 10 +- 2 files changed, 190 insertions(+), 10 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index c0f02b71..82d083d5 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -36,6 +36,7 @@ def test_init_and_empty(self): assert empty.shape == (0,) assert empty.ndim == 1 assert empty.values.size == 0 + assert empty.indices.size == 0 def test_init_validation_numeric(self): # valid numeric @@ -45,6 +46,8 @@ def test_init_validation_numeric(self): assert len(coord) == 3 assert coord.start == 0.0 assert coord.end == 3.0 + assert coord.issampled() + coord.get_sampling_interval() == 1.0 # mismatched lengths with pytest.raises(ValueError): @@ -71,6 +74,12 @@ def test_init_validation_numeric(self): {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": [1.0]} ) + # non-numeric tie_values + with pytest.raises(ValueError): + SampledCoordinate( + {"tie_values": ["a"], "tie_lengths": [3], "sampling_interval": 1.0} + ) + def test_init_validation_datetime(self): # valid datetime with timedelta sampling interval t0 = np.datetime64("2000-01-01T00:00:00") @@ -83,6 +92,8 @@ def test_init_validation_datetime(self): ) assert coord.start == t0 assert coord.end == t0 + np.timedelta64(2, "s") + assert coord.get_sampling_interval() == 1 + assert coord.get_sampling_interval(cast=False) == np.timedelta64(1, "s") # invalid: datetime with numeric sampling interval with pytest.raises(ValueError): @@ -90,6 +101,34 @@ def test_init_validation_datetime(self): {"tie_values": [t0], "tie_lengths": [2], "sampling_interval": 1} ) + def test_invalid_data(self): + # lack of required keys + with pytest.raises(ValueError): + SampledCoordinate({"tie_values": [0.0], "tie_lengths": [3]}) + with pytest.raises(ValueError): + SampledCoordinate({"tie_lengths": [3], "sampling_interval": 1.0}) + with 
pytest.raises(ValueError): + SampledCoordinate({"tie_values": [0.0], "sampling_interval": 1.0}) + + def test_invalid_shapes(self): + # tie_values and tie_lengths must be 1D + with pytest.raises(ValueError): + SampledCoordinate( + { + "tie_values": [[0.0, 10.0]], + "tie_lengths": [3, 2], + "sampling_interval": 1.0, + } + ) + with pytest.raises(ValueError): + SampledCoordinate( + { + "tie_values": [0.0, 10.0], + "tie_lengths": [[3], [2]], + "sampling_interval": 1.0, + } + ) + class TestSampledCoordinateIndexing: def make_coord(self): @@ -133,6 +172,13 @@ def test_get_value_scalar_and_vector(self): with pytest.raises(IndexError): coord.get_value([-6, 0]) + def test_values(self): + coord = self.make_coord() + expected = np.array([0.0, 1.0, 2.0, 10.0, 11.0]) + assert np.array_equal(coord.values, expected) + assert np.array_equal(coord.__array__(), expected) + assert np.array_equal(coord.__array__(dtype=expected.dtype), expected) + def test_getitem(self): coord = self.make_coord() # scalar -> ScalarCoordinate @@ -176,11 +222,36 @@ def test_getitem(self): arr = coord[[0, 4]] assert isinstance(arr, DenseCoordinate) assert np.array_equal(arr.values, np.array([0.0, 11.0])) + # negative step is not implemented yet + with pytest.raises(NotImplementedError): + coord[4:0:-1] def test_repr(self): - # Just ensure it returns a string - coord = self.make_coord() - assert isinstance(repr(coord), str) + # floating coord + floating = self.make_coord() + assert isinstance(repr(floating), str) + # integer coord + integer = SampledCoordinate( + {"tie_values": [0], "tie_lengths": [3], "sampling_interval": 1} + ) + assert isinstance(repr(integer), str) + # empty coord + empty = SampledCoordinate() + assert repr(empty) == "empty coordinate" + # singleton + singleton = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [1], "sampling_interval": 1.0} + ) + assert isinstance(repr(singleton), str) + # numeric coord + datetime = SampledCoordinate( + { + "tie_values": 
[np.datetime64("2000-01-01T00:00:00")], + "tie_lengths": [3], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + assert isinstance(repr(datetime), str) class TestSampledCoordinateSliceEdgeCases: @@ -336,6 +407,31 @@ def test_get_indexer_bfill(self): with pytest.raises(KeyError): coord.get_indexer([t0, t0 + np.timedelta64(20, "s")], method="bfill") + def test_get_indexer_overlap(self): + coord = SampledCoordinate( + {"tie_values": [0.0, 2.0], "tie_lengths": [3, 3], "sampling_interval": 1.0} + ) # segments: [0,1,2] and [2,3,4] + assert coord.get_indexer(1.0) == 1 + assert coord.get_indexer(3.0) == 4 + with pytest.raises(KeyError): + coord.get_indexer(2.0) + coord = SampledCoordinate( + {"tie_values": [0.0, 2.0], "tie_lengths": [5, 5], "sampling_interval": 1.0} + ) # segments: [0,1,2,3,4] and [2,3,4,5,6] + assert coord.get_indexer(1.0) == 1 + assert coord.get_indexer(6.0) == 9 + with pytest.raises(KeyError): + coord.get_indexer(2.0) + with pytest.raises(KeyError): + coord.get_indexer(2.5, method="nearest") + with pytest.raises(KeyError): + coord.get_indexer(4.0) + + def test_get_indexer_invalid_method(self): + coord = self.make_coord() + with pytest.raises(ValueError): + coord.get_indexer(0.0, method="invalid") + class TestSampledCoordinateAppend: def test_append_two_coords(self): @@ -411,6 +507,26 @@ def test_append_dtype_mismatch(self): with pytest.raises(ValueError): coord1.append(coord2) + def test_append_type_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + coord2 = DenseCoordinate(np.array([10.0, 11.0])) + with pytest.raises(TypeError): + coord1.append(coord2) + + def test_append_dimension_mismatch(self): + coord1 = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0}, + dim="time", + ) + coord2 = SampledCoordinate( + {"tie_values": [10.0], "tie_lengths": [2], "sampling_interval": 1.0}, + dim="depth", + ) + with pytest.raises(ValueError): + 
coord1.append(coord2) + class TestSampledCoordinateDiscontinuitiesAvailabilities: def test_discontinuities_and_availabilities(self): @@ -454,6 +570,25 @@ def test_to_dict_contains_expected_keys(self): "sampling_interval", } + def test_to_dict_with_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [t0, t0 + np.timedelta64(10, "s")], + "tie_lengths": [3, 2], + "sampling_interval": np.timedelta64(1, "s"), + }, + dim="time", + ) + d = coord.to_dict() + assert "dim" in d + assert "data" in d + assert set(d["data"].keys()) >= { + "tie_values", + "tie_lengths", + "sampling_interval", + } + class TestSampledCoordinateSlicing: def make_coord(self): @@ -644,6 +779,11 @@ def test_get_indexer_datetime_methods(self): coord.get_indexer(np.datetime64("1999-12-31T23:59:59")) with pytest.raises(KeyError): coord.get_indexer(np.datetime64("2000-01-01T00:00:12")) + # string input + assert coord.get_indexer("2000-01-01T00:00:01.500", method="nearest") in [1, 2] + # invalid method + with pytest.raises(ValueError): + coord.get_indexer(t, method="bad") def test_start_end_properties_datetime(self): coord = self.make_dt_coord() @@ -714,3 +854,47 @@ def test_to_netcdf_and_back(self): expected.to_netcdf(file.name) result = xd.open_dataarray(file.name) assert result.equals(expected) + + +class TestGetSplitIndices: + def test_get_split_indices_no_tolerance(self): + coord = SampledCoordinate( + {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} + ) + div_points = coord.get_split_indices() + expected = np.array([3]) # indices where segments end + assert np.array_equal(div_points, expected) + + def test_get_split_indices_with_tolerance(self): + coord = SampledCoordinate( + { + "tie_values": [0.0, 3.1, 10.0], + "tie_lengths": [3, 2, 2], + "sampling_interval": 1.0, + } + ) + div_points = coord.get_split_indices(tolerance=0.2) + expected = np.array([5]) # only the second gap exceeds tolerance + assert 
np.array_equal(div_points, expected) + + +class TestFromBlock: + def test_from_block(self): + result = SampledCoordinate.from_block(start=0.0, size=5, step=1.0) + expected = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [5], "sampling_interval": 1.0} + ) + assert result.equals(expected) + + +class TestNotImplementedMethods: + def test_raises(self): + coord = SampledCoordinate( + {"tie_values": [0.0], "tie_lengths": [3], "sampling_interval": 1.0} + ) + with pytest.raises(NotImplementedError): + coord.__array_ufunc__(None, None) + with pytest.raises(NotImplementedError): + coord.__array_function__(None, None, None, None) + with pytest.raises(NotImplementedError): + coord.from_array(None) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index cc02a4ec..7afd7cde 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -33,7 +33,7 @@ def __init__(self, data=None, dim=None, dtype=None): # parse data data, dim = parse(data, dim) if not self.__class__.isvalid(data): - raise TypeError( + raise ValueError( "`data` must be dict-like and contain `tie_values`, `tie_lengths`, and " "`sampling_interval`" ) @@ -293,7 +293,7 @@ def get_indexer(self, value, method=None): self.tie_values[before] + (self.tie_lengths[before] - 1) * self.sampling_interval ) - if np.any((reference > 0) & (value < end)): + if np.any((reference > 0) & (value <= end)): raise KeyError("value is in an overlap region") # gap @@ -342,10 +342,6 @@ def get_indexer(self, value, method=None): if np.any(offset > self.tie_lengths[reference] - 1): raise KeyError("index not found") offset = np.maximum(offset, 0) - case _: - raise ValueError( - "method must be one of `None`, 'nearest', 'ffill', or 'bfill'" - ) return self.tie_indices[reference] + offset def append(self, other): @@ -404,7 +400,7 @@ def get_split_indices(self, tolerance=None): deltas = self.tie_values[1:] - ( self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] ) - indices = 
indices[np.abs(deltas) <= tolerance] + indices = indices[np.abs(deltas) > tolerance] return indices @classmethod From a1bd76a42790d6e4b374d731c55ec0669a4e7d4d Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 18:58:27 +0100 Subject: [PATCH 063/104] Fix xdas.io.__init__.py --- xdas/io/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xdas/io/__init__.py b/xdas/io/__init__.py index 926a1a90..54f6f22e 100644 --- a/xdas/io/__init__.py +++ b/xdas/io/__init__.py @@ -1 +1,2 @@ +from . import apsensing, asn, febus, miniseed, optasense, silixa, sintela, terra15 from .core import get_free_port From de02a6c2b39f6b976d4841ddf9d32eb47d114010 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 18:58:48 +0100 Subject: [PATCH 064/104] Remove unintentionally added file --- .coverage | Bin 53248 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .coverage diff --git a/.coverage b/.coverage deleted file mode 100644 index 1f297399c8e31876e1f07878b2cc2ed5d864599b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 53248 zcmeI)Piz}i9tZIG=W(6Hd4;O7vZ2a56w-(!CLlnNiMQS}zQx zb0Z__g|`E%SQUY1m4p|~vnO7hpF1%!v;c2#iO zDcdD0u$>AI7DbY};fu1o(9t%!)X9X?VSc$(u*;My5EUw-?%6fVTj6KL%4C=zK3X7_ zgJ=U4BJ7GIQ}APP_6hEZ1>p&&B>X6fV|ICBVB+28KstBmka|&sb+Wt?{SEfihBkDn za(aOZD!E=+Mql(Sr?e>iNp5Y|u;fyl%Ry9I&#mPHmsf2ktfu4Jf$ch6EQ?Yj5ak^! 
zI4RMRiSW8>LuI`oEA&Q`Gd@AQaTC@~MtNw;9LDT4YDFS{vr((d zlS4V80|)MIId{t&xN&8F_PN>NTz2Z)b2<}pc}AmL4aYv$5Zfnzl6P`WhShM~NiHiv zE#TfkGxyjbRiRcXmi=>8dc9S!8i5P)NS;E`4*czf zOi!cP-n>iGk^6UVH?o)XHWJy3s#WeTgk47F=F!Qa*4*7^@(G6J$&SX|I3Y#)i?%iy zEXBZ!RkzsN*1{I!3+f;=79`0Q8cs=e6<^k&1;QF!p!#KKx@QI=SD}>3z@JP#b+yiO7}FO>nFOG!zH5&BgL+=S$Zs~;lj98gEZ&hhDsav*3 zU6#h+=nBAP!m>q)t)klq!uBgy!{#ZkJ)L-?0y^u7J*AomlT-J8-t~Q(F01mgQN%FGnGNoEKfUDl8|uHc*b!J{{Bcc^Jd(SEZ*qW%w(yjLP}(Fs9_<&@*=mAni`rPA0hJEuHOG>A|}K2Dv|r!IO+nMtAXUO8eVzrA2XDanaT2q!oz3t_d!agu#RjYsvC?$rH4buB8*>`cE|jdA15OaxU+Z56G8t ztuZy7yZ?T*)rdP|G|m)!x~SKzBuDsn?Ci9c=V-vCS3BM9MA~(6$kpfwa&U{z zU@=liQpK-z1}5YUzRETf`ojbP2tWV=5P$##AOHafKmY;|fWVz6psSjimiPY}`&?n` zG{Xb|2tWV=5P$##AOHafKmY;|fWZC~7}E43Ecq9Y2bijl=Hh<>cy#)~>4yiXRvP{300Izz00bZa0SG_<0uWG?Q$~xa zd$>#Oz5h4ojaIh5Vs^OyPd#I_M*1tI>z29q{@-{)Z}wMI_x-0C|6g@wZ`u*BlTTe9G&1SQ_b#-g@+Iq9{kAqo*_T6JRTkZC0 zyRE;br_@aA5q;|eS&l{zgMXxacjM}-mp;9IeQLJ-Lnc1JkfQwk=+SpnozGp#ynp$> zGpmhM{jc{fEiEWKrX5p_OyH^1Ioe{1ctmpA{^e(G-<*V^ruzFcp%Kb#G}|Ia>A*arI_yTSg=R%rzj z1Rwwb2tWV=5P$##AOHafKmY>!LO{MTp_<|Q6KX0It{F!7r0e06rWy1eLHPNt0uX=z1Rwwb2tWV=5P$##Ah1UP`T0NY|M!T8Y!H9|1Rwwb2tWV=5P$##AOL}V zA`pH5{|ov5|KGE3**EMz?2CO8I>Lki1Rwwb2tWV=5P$##AOHafKww)yH|b~o^^}|$ Oa;D3fCTH^Z|9=E3Y{Ys1 From a2eb4a1160b81358fd936c5860281a81b33a9714 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 19:32:32 +0100 Subject: [PATCH 065/104] allow per dim ctype for coordinates. 
--- xdas/io/apsensing.py | 8 +++++--- xdas/io/asn.py | 12 +++++++----- xdas/io/core.py | 23 +++++++++++++++++++++++ xdas/io/febus.py | 8 +++++--- xdas/io/optasense.py | 6 ++++-- xdas/io/silixa.py | 6 ++++-- xdas/io/sintela.py | 6 ++++-- xdas/io/terra15.py | 8 +++++--- 8 files changed, 57 insertions(+), 20 deletions(-) diff --git a/xdas/io/apsensing.py b/xdas/io/apsensing.py index 937b165e..fca5089d 100644 --- a/xdas/io/apsensing.py +++ b/xdas/io/apsensing.py @@ -4,9 +4,11 @@ from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, ctype="interpolated"): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: t0 = file["Metadata"]["Timestamp"][()].item().decode() fs = file["DAQ"]["RepetitionFrequency"][()].item() @@ -19,6 +21,6 @@ def read(fname, ctype="interpolated"): t0 = np.datetime64(t0) dt = np.timedelta64(round(1e9 / fs), "ns") nt, nd = data.shape - time = Coordinate[ctype].from_block(t0, nt, dt, dim="time") - distance = Coordinate[ctype].from_block(0.0, nd, dx, dim="distance") + time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype["distance"]].from_block(0.0, nd, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 3e935364..5ed5bb52 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -7,9 +7,11 @@ from ..coordinates.core import Coordinate, get_sampling_interval from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, ctype="interpolated"): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: header = file["header"] t0 = np.datetime64(round(header["time"][()] * 1e9), "ns") @@ -17,8 +19,8 @@ def read(fname, ctype="interpolated"): dx = header["dx"][()] * 
np.median(np.diff(header["channels"])) data = VirtualSource(file["data"]) nt, nx = data.shape - time = Coordinate[ctype].from_block(t0, nt, dt, dim="time") - distance = Coordinate[ctype].from_block(0.0, nx, dx, dim="distance") + time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype["distance"]].from_block(0.0, nx, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) @@ -105,7 +107,7 @@ def _update_header(self, message): roiTable = header["roiTable"][0] di = (roiTable["roiStart"] // roiTable["roiDec"]) * header["dx"] de = (roiTable["roiEnd"] // roiTable["roiDec"]) * header["dx"] - self.distance = { + self.distance = { # TODO: use from_block "tie_indices": [0, header["nChannels"] - 1], "tie_values": [di, de], } @@ -114,7 +116,7 @@ def _update_header(self, message): def _unpack(self, message): t0 = np.frombuffer(message[:8], "datetime64[ns]").reshape(()) data = np.frombuffer(message[8:], self.dtype).reshape(self.shape) - time = { + time = { # TODO: use from_block "tie_indices": [0, self.shape[0] - 1], "tie_values": [t0, t0 + (self.shape[0] - 1) * self.delta], } diff --git a/xdas/io/core.py b/xdas/io/core.py index a4171a76..806740af 100644 --- a/xdas/io/core.py +++ b/xdas/io/core.py @@ -18,3 +18,26 @@ def get_free_port(): with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: s.bind(("", 0)) return s.getsockname()[1] + + +def parse_ctype(ctype): + if ctype is None: + ctype = { + "time": "interpolated", + "distance": "interpolated", + } + elif isinstance(ctype, str): + ctype = { + "time": ctype, + "distance": ctype, + } + elif isinstance(ctype, dict): + ctype = { + "time": ctype.get("time", "interpolated"), + "distance": ctype.get("distance", "interpolated"), + } + else: + raise ValueError( + "ctype must be None, str, or dict with 'time' and/or 'distance' keys" + ) + return ctype diff --git a/xdas/io/febus.py b/xdas/io/febus.py index 747b174e..e1c2f84d 100644 --- a/xdas/io/febus.py +++ 
b/xdas/io/febus.py @@ -7,9 +7,10 @@ from ..core.dataarray import DataArray from ..core.routines import concatenate from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, overlaps=None, offset=None, ctype="interpolated"): +def read(fname, overlaps=None, offset=None, ctype=None): """ Open a Febus file into a xdas DataArray object. @@ -41,6 +42,7 @@ def read(fname, overlaps=None, offset=None, ctype="interpolated"): A data array containing the data from the Febus file. """ + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: (device_name,) = list(file.keys()) source = file[device_name]["Source1"] @@ -94,8 +96,8 @@ def read(fname, overlaps=None, offset=None, ctype="interpolated"): for t0, chunk in zip(times, chunks): t0 = np.rint(1e6 * t0).astype("M8[us]").astype("M8[ns]") - time = Coordinate[ctype].from_block(t0, nt, dt, dim="time") - distance = Coordinate[ctype].from_block(0.0, nx, dx, dim="distance") + time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") + distance = Coordinate[ctype["distance"]].from_block(0.0, nx, dx, dim="distance") da = DataArray(chunk, {"time": time, "distance": distance}, name=name) dc.append(da) diff --git a/xdas/io/optasense.py b/xdas/io/optasense.py index 16e175cf..f4f34bf7 100644 --- a/xdas/io/optasense.py +++ b/xdas/io/optasense.py @@ -4,9 +4,11 @@ from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, ctype="interpolated"): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] @@ -19,5 +21,5 @@ def read(fname, ctype="interpolated"): "tie_indices": [0, nt - 1], "tie_values": [tstart, tend], } # TODO: use from_block - distance = Coordinate[ctype].from_block(0.0, nd, dx, dim="distance") + distance = Coordinate[ctype["distance"]].from_block(0.0, nd, dx, 
dim="distance") return DataArray(data, {"distance": distance, "time": time}) diff --git a/xdas/io/silixa.py b/xdas/io/silixa.py index ae97867e..2c64e688 100644 --- a/xdas/io/silixa.py +++ b/xdas/io/silixa.py @@ -3,10 +3,12 @@ from ..coordinates.core import Coordinate from ..core.dataarray import DataArray +from .core import parse_ctype from .tdms import TdmsReader -def read(fname, ctype="interpolated"): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) shape, dtype, coords = read_header(fname, ctype) data = dask.array.from_delayed(dask.delayed(read_data)(fname), shape, dtype) return DataArray(data, coords) @@ -19,7 +21,7 @@ def read_header(fname, ctype): dtype = tdms._data_type t0 = np.datetime64(props["GPSTimeStamp"]) dt = np.timedelta64(round(1e9 / props["SamplingFrequency[Hz]"]), "ns") - time = Coordinate[ctype].from_block(t0, shape[0], dt, dim="time") + time = Coordinate[ctype["time"]].from_block(t0, shape[0], dt, dim="time") distance = { "tie_indices": [0, shape[1] - 1], "tie_values": [props["Start Distance (m)"], props["Stop Distance (m)"]], diff --git a/xdas/io/sintela.py b/xdas/io/sintela.py index 605f50fd..60902824 100644 --- a/xdas/io/sintela.py +++ b/xdas/io/sintela.py @@ -4,9 +4,11 @@ from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, ctype="interpolated"): +def read(fname, ctype=None): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: acquisition = file["Acquisition"] dx = acquisition.attrs["SpatialSamplingInterval"] @@ -19,5 +21,5 @@ def read(fname, ctype="interpolated"): "tie_indices": [0, nt - 1], "tie_values": [tstart, tend], } # TODO: use from_block - distance = Coordinate[ctype].from_block(0.0, nd, dx, dim="distance") + distance = Coordinate[ctype["distance"]].from_block(0.0, nd, dx, dim="distance") return DataArray(data, {"time": time, "distance": distance}) diff --git a/xdas/io/terra15.py b/xdas/io/terra15.py 
index e6cf2426..e9a539e4 100644 --- a/xdas/io/terra15.py +++ b/xdas/io/terra15.py @@ -6,9 +6,11 @@ from ..coordinates.core import Coordinate from ..core.dataarray import DataArray from ..virtual import VirtualSource +from .core import parse_ctype -def read(fname, tz=timezone.utc, ctype="interpolated"): +def read(fname, ctype=None, tz=timezone.utc): + ctype = parse_ctype(ctype) with h5py.File(fname, "r") as file: ti = np.datetime64( datetime.fromtimestamp(file["data_product"]["gps_time"][0], tz=tz) @@ -21,5 +23,5 @@ def read(fname, tz=timezone.utc, ctype="interpolated"): data = VirtualSource(file["data_product"]["data"]) nt, nd = data.shape time = {"tie_indices": [0, nt - 1], "tie_values": [ti, tf]} # TODO: use from_block - ctype = Coordinate[ctype].from_block(d0, nd, dx, dim="distance") - return DataArray(data, {"time": time, "distance": ctype}) + distance = Coordinate[ctype["distance"]].from_block(d0, nd, dx, dim="distance") + return DataArray(data, {"time": time, "distance": distance}) From 0ee8f3150b03855d82b584d64d24facdc12b7b22 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 19:35:39 +0100 Subject: [PATCH 066/104] Add python 3.14 for testing in github actions. 
--- .github/workflows/code-coverage.yaml | 2 +- .github/workflows/tests.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/code-coverage.yaml b/.github/workflows/code-coverage.yaml index 0d31cd94..1451a479 100644 --- a/.github/workflows/code-coverage.yaml +++ b/.github/workflows/code-coverage.yaml @@ -13,7 +13,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: 3.13 + python-version: 3.14 - name: Install dependencies run: | diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index c3e631fa..7a4e350c 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -8,7 +8,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.11", "3.12", "3.13", "3.14"] steps: - uses: actions/checkout@v4 From 2ed500c86693e0f25e96e9ce25232a443b7ed37e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 21:16:41 +0100 Subject: [PATCH 067/104] Fix sampling interval handling for older xarray versions --- xdas/coordinates/sampled.py | 14 ++++++++++++++ xdas/core/dataarray.py | 9 ++++++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 7afd7cde..0a8d8b51 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -459,6 +459,20 @@ def from_dataset(cls, dataset, name): "tie_lengths": dataset[lengths].values, "sampling_interval": dataset[sampling].values[()], } + + # TODO: remove when dropping support for python 3.10 + import xarray + + if ( + xarray.__version__ < "2025.7" + and "dtype" in dataset[sampling].attrs + and "units" in dataset[sampling].attrs + ): + data["sampling_interval"] = np.array( + data["sampling_interval"], + dtype=dataset[sampling].attrs["dtype"], + )[()] + coords[name] = Coordinate(data, dim) return coords diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py 
index 468aa4d6..d769493d 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -943,7 +943,14 @@ def from_netcdf(cls, fname, group=None): The openend data array. """ # read metadata - with xr.open_dataset(fname, group=group, engine="h5netcdf") as dataset: + with xr.open_dataset( + fname, + group=group, + engine="h5netcdf", + decode_timedelta=( + xr.__version__ >= "2025.7" + ), # TODO: remove when dropping support for python 3.10 + ) as dataset: # check file format if not ( "Conventions" in dataset.attrs and "CF" in dataset.attrs["Conventions"] From b1e3dface4c1757cd6bed5da0c121791c5ba8aaf Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Wed, 14 Jan 2026 21:45:19 +0100 Subject: [PATCH 068/104] Manual timedelta handling. --- xdas/coordinates/sampled.py | 36 +++++++++++++++++++++++++++--------- xdas/core/dataarray.py | 7 +------ 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 0a8d8b51..a7a37e00 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -5,6 +5,17 @@ from .core import Coordinate, format_datetime, is_strictly_increasing, parse +CODE_TO_UNITS = { + "h": "hours", + "m": "minutes", + "s": "seconds", + "ms": "milliseconds", + "us": "microseconds", + "ns": "nanoseconds", +} +UNITS_TO_CODE = {v: k for k, v in CODE_TO_UNITS.items()} + + class SampledCoordinate(Coordinate, name="sampled"): """ A coordinate that is sampled at regular intervals. 
@@ -434,9 +445,19 @@ def to_dataset(self, dataset, attrs): interp_attrs = { "tie_point_mapping": f"{self.dim}: {self.name}_values {self.name}_lengths", } + + # timedelta + if np.issubdtype(self.sampling_interval.dtype, np.timedelta64): + code, count = np.datetime_data(self.sampling_interval.dtype) + interp_attrs["dtype"] = "timedelta64[ns]" + interp_attrs["units"] = CODE_TO_UNITS[code] + sampling_interval = count * self.sampling_interval.astype(int) + else: + sampling_interval = self.sampling_interval + dataset.update( { - f"{self.name}_sampling": ((), self.sampling_interval, interp_attrs), + f"{self.name}_sampling": ((), sampling_interval, interp_attrs), f"{self.name}_values": (f"{self.name}_points", tie_values), f"{self.name}_lengths": (f"{self.name}_points", tie_lengths), } @@ -460,18 +481,15 @@ def from_dataset(cls, dataset, name): "sampling_interval": dataset[sampling].values[()], } - # TODO: remove when dropping support for python 3.10 - import xarray - + # timedelta if ( - xarray.__version__ < "2025.7" - and "dtype" in dataset[sampling].attrs + "dtype" in dataset[sampling].attrs and "units" in dataset[sampling].attrs ): - data["sampling_interval"] = np.array( + data["sampling_interval"] = np.timedelta64( data["sampling_interval"], - dtype=dataset[sampling].attrs["dtype"], - )[()] + UNITS_TO_CODE[dataset[sampling].attrs.pop("units")], + ).astype(dataset[sampling].attrs.pop("dtype")) coords[name] = Coordinate(data, dim) return coords diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index d769493d..c8efebcd 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -944,12 +944,7 @@ def from_netcdf(cls, fname, group=None): """ # read metadata with xr.open_dataset( - fname, - group=group, - engine="h5netcdf", - decode_timedelta=( - xr.__version__ >= "2025.7" - ), # TODO: remove when dropping support for python 3.10 + fname, group=group, engine="h5netcdf", decode_timedelta=False ) as dataset: # check file format if not ( From 
24e4afab9cf11e59d9f7e918fbdbc8e47c5302f5 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 18:33:53 +0100 Subject: [PATCH 069/104] Add release notes for SampleCoordinate feature --- docs/release-notes.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/release-notes.md b/docs/release-notes.md index 0dd5b92a..d52e3f06 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,5 +1,8 @@ # Release notes +## 0.X.X +- Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). + ## 0.2.4 - Add StreamWriter to write long time series to miniSEED (@marbail). - Fix OptaSense engine wrong axis attribution (@smouellet). From 3f24463926f1ce64f2d4f6c1fb1436492430944d Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 18:34:48 +0100 Subject: [PATCH 070/104] Fix xdas.__init__.py. --- xdas/__init__.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/xdas/__init__.py b/xdas/__init__.py index 44220eb0..9326541c 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -1,8 +1,21 @@ +from . 
import ( + atoms, + config, + coordinates, + fft, + io, + parallel, + processing, + signal, + synthetics, + virtual, +) from .coordinates import ( Coordinate, Coordinates, get_sampling_interval, ) +from .core import dataarray, datacollection, methods, numpy, routines from .core.dataarray import DataArray from .core.datacollection import DataCollection, DataMapping, DataSequence from .core.methods import * From d85b6e4921eeed8cfe321836f1fb4e2374761acc Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 18:35:06 +0100 Subject: [PATCH 071/104] Remove unnecessary blank line in sampled.py --- xdas/coordinates/sampled.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index a7a37e00..34485eaf 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -4,7 +4,6 @@ from .core import Coordinate, format_datetime, is_strictly_increasing, parse - CODE_TO_UNITS = { "h": "hours", "m": "minutes", From 035fd83d6c820ae085486bcf679ebc8f02774376 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 18:35:49 +0100 Subject: [PATCH 072/104] Quick update of docs API. --- docs/api/coordinates.md | 216 ++++++++++++++++++++++++++++++++++++++++ docs/api/index.md | 4 +- docs/api/xdas.md | 158 ----------------------------- 3 files changed, 219 insertions(+), 159 deletions(-) create mode 100644 docs/api/coordinates.md diff --git a/docs/api/coordinates.md b/docs/api/coordinates.md new file mode 100644 index 00000000..e94b1570 --- /dev/null +++ b/docs/api/coordinates.md @@ -0,0 +1,216 @@ +```{eval-rst} +.. currentmodule:: xdas.coordinates +``` +# xdas.coordinates + +## Coordinates + +Constructor + + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinates +``` + +Methods + +```{eval-rst} +.. 
autosummary:: + :toctree: ../_autosummary + + Coordinates.isdim + Coordinates.get_query + Coordinates.to_index + Coordinates.equals + Coordinates.to_dict + Coordinates.copy + Coordinates.drop_dims + Coordinates.drop_coords +``` + +### Coordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinate +``` + +Attributes + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinate.dtype + Coordinate.ndim + Coordinate.shape + Coordinate.values +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + Coordinate.to_index + Coordinate.isscalar + Coordinate.isdense + Coordinate.isinterp +``` + + +### ScalarCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + ScalarCoordinate +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + ScalarCoordinate.isvalid + ScalarCoordinate.equals + ScalarCoordinate.to_index + ScalarCoordinate.to_dict +``` + +### DenseCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DenseCoordinate +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + DenseCoordinate.isvalid + DenseCoordinate.index + DenseCoordinate.get_indexer + DenseCoordinate.slice_indexer + DenseCoordinate.to_dict +``` + +### InterpCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + InterpCoordinate +``` + +Attributes + + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + InterpCoordinate.tie_indices + InterpCoordinate.tie_values + InterpCoordinate.empty + InterpCoordinate.dtype + InterpCoordinate.ndim + InterpCoordinate.shape + InterpCoordinate.indices + InterpCoordinate.values +``` + +Methods + +```{eval-rst} +.. 
autosummary:: + :toctree: ../_autosummary + + InterpCoordinate.isvalid + InterpCoordinate.equals + InterpCoordinate.get_value + InterpCoordinate.format_index + InterpCoordinate.slice_index + InterpCoordinate.get_indexer + InterpCoordinate.slice_indexer + InterpCoordinate.decimate + InterpCoordinate.simplify + InterpCoordinate.get_discontinuities + InterpCoordinate.from_array + InterpCoordinate.to_dict +``` + + +### SampledCoordinate + +Constructor + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + SampledCoordinate +``` + +Attributes + + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + SampledCoordinate.tie_values + SampledCoordinate.tie_lengths + SampledCoordinate.tie_indices + SampledCoordinate.sampling_interval + SampledCoordinate.empty + SampledCoordinate.dtype + SampledCoordinate.ndim + SampledCoordinate.shape + SampledCoordinate.indices + SampledCoordinate.values +``` + +Methods + +```{eval-rst} +.. autosummary:: + :toctree: ../_autosummary + + SampledCoordinate.isvalid + SampledCoordinate.equals + SampledCoordinate.get_sampling_interval + SampledCoordinate.get_value + SampledCoordinate.slice_index + SampledCoordinate.get_indexer + SampledCoordinate.slice_indexer + SampledCoordinate.append + SampledCoordinate.decimate + SampledCoordinate.simplify + SampledCoordinate.get_split_indices + SampledCoordinate.from_array + SampledCoordinate.to_dict + SampledCoordinate.from_block +``` \ No newline at end of file diff --git a/docs/api/index.md b/docs/api/index.md index 3f2d665b..d16f304d 100644 --- a/docs/api/index.md +++ b/docs/api/index.md @@ -5,9 +5,11 @@ xdas atoms -io +coordinates fft +io parallel +picking processing signal synthetics diff --git a/docs/api/xdas.md b/docs/api/xdas.md index 118b079b..be198ce9 100644 --- a/docs/api/xdas.md +++ b/docs/api/xdas.md @@ -168,161 +168,3 @@ Methods DataSequence.map ``` -### Coordinates - -Constructor - - -```{eval-rst} -.. 
autosummary:: - :toctree: ../_autosummary - - Coordinates -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinates.isdim - Coordinates.get_query - Coordinates.to_index - Coordinates.equals - Coordinates.to_dict - Coordinates.copy - Coordinates.drop_dims - Coordinates.drop_coords -``` - -### Coordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinate -``` - -Attributes - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinate.dtype - Coordinate.ndim - Coordinate.shape - Coordinate.values -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - Coordinate.to_index - Coordinate.isscalar - Coordinate.isdense - Coordinate.isinterp -``` - - -### ScalarCoordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - ScalarCoordinate -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - ScalarCoordinate.isvalid - ScalarCoordinate.equals - ScalarCoordinate.to_index - ScalarCoordinate.to_dict -``` - -### DenseCoordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - DenseCoordinate -``` - -Methods - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - DenseCoordinate.isvalid - DenseCoordinate.index - DenseCoordinate.get_indexer - DenseCoordinate.slice_indexer - DenseCoordinate.to_dict -``` - -### InterpCoordinate - -Constructor - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - InterpCoordinate -``` - -Attributes - - -```{eval-rst} -.. autosummary:: - :toctree: ../_autosummary - - InterpCoordinate.tie_indices - InterpCoordinate.tie_values - InterpCoordinate.empty - InterpCoordinate.dtype - InterpCoordinate.ndim - InterpCoordinate.shape - InterpCoordinate.indices - InterpCoordinate.values -``` - -Methods - -```{eval-rst} -.. 
autosummary:: - :toctree: ../_autosummary - - InterpCoordinate.isvalid - InterpCoordinate.equals - InterpCoordinate.get_value - InterpCoordinate.format_index - InterpCoordinate.slice_index - InterpCoordinate.get_indexer - InterpCoordinate.slice_indexer - InterpCoordinate.decimate - InterpCoordinate.simplify - InterpCoordinate.get_discontinuities - InterpCoordinate.from_array - InterpCoordinate.to_dict -``` \ No newline at end of file From 6c2b21da388cb6d94dcc44fe2971240a3172dd51 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 18:44:29 +0100 Subject: [PATCH 073/104] WIP: Making a coordinates docs tree. --- docs/getting-started.md | 2 +- docs/user-guide/coordinates/index.md | 3 +++ docs/user-guide/{ => coordinates}/interpolated-coordinates.md | 4 ++-- .../{data-structure => data-structures}/dataarray.md | 4 ++-- .../{data-structure => data-structures}/datacollection.md | 0 docs/user-guide/{data-structure => data-structures}/index.md | 2 +- docs/user-guide/index.md | 4 ++-- docs/user-guide/virtual-datasets.md | 2 +- 8 files changed, 12 insertions(+), 9 deletions(-) create mode 100644 docs/user-guide/coordinates/index.md rename docs/user-guide/{ => coordinates}/interpolated-coordinates.md (98%) rename docs/user-guide/{data-structure => data-structures}/dataarray.md (95%) rename docs/user-guide/{data-structure => data-structures}/datacollection.md (100%) rename docs/user-guide/{data-structure => data-structures}/index.md (98%) diff --git a/docs/getting-started.md b/docs/getting-started.md index edff32ef..f717052a 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -65,7 +65,7 @@ da Xdas only loads the metadata from each file and returns a {py:class}`~xdas.DataArray` object. This object has mainly two attributes. First a `data` attribute that contain the data. Here a {py:class}`~xdas.VirtualStack` object that is a pointer to the different files we opened. 
Second, a `coords` attribute that contains the metadata related to how the space and the time are sampled. Here both dimensions are labeled using {py:class}`~xdas.InterpCoordinate` objects. Those allow to concisely store the time and space information, including potential gaps and overlaps. See the [](user-guide/interpolated-coordinates) section for more information. -Note that if you want to create a single data collection object for multiple acquisitions (i.e. different instruments or several acquisition with different parameters), you can use the [DataCollection](user-guide/data-structure/datacollection) structure. +Note that if you want to create a single data collection object for multiple acquisitions (i.e. different instruments or several acquisition with different parameters), you can use the [DataCollection](user-guide/data-structures/datacollection) structure. ```{note} For Febus users, converting native files into Xdas NetCDF format generally improves I/O operations and reduce the amount of data by a factor two. This can be done by looping over Febus files and running: `xdas.open_dataarray("path_to_febus_file.h5", engine="febus").to_netcdf("path_to_xdas_file.nc", virtual=False)`. The converted files can then be linked as described above. 
diff --git a/docs/user-guide/coordinates/index.md b/docs/user-guide/coordinates/index.md new file mode 100644 index 00000000..12770890 --- /dev/null +++ b/docs/user-guide/coordinates/index.md @@ -0,0 +1,3 @@ +# Coordinates + +TODO \ No newline at end of file diff --git a/docs/user-guide/interpolated-coordinates.md b/docs/user-guide/coordinates/interpolated-coordinates.md similarity index 98% rename from docs/user-guide/interpolated-coordinates.md rename to docs/user-guide/coordinates/interpolated-coordinates.md index d4821f87..6ae7aee9 100644 --- a/docs/user-guide/interpolated-coordinates.md +++ b/docs/user-guide/coordinates/interpolated-coordinates.md @@ -59,7 +59,7 @@ is to `simplify` the coordinates, increasing the tolerance such that the overlap disappear. ``` -# Gaps and Overlaps +## Gaps and Overlaps Gaps and Overlaps can be easily identified based on the tie point positions, and extracted with: @@ -79,7 +79,7 @@ coord = coord.simplify(tolerance=0.0) coord ``` -# Temporal Coordinates +## Temporal Coordinates The main use of coordinates in *xdas* is to deal with long time series. By default *xdas* uses `"datetime64[us]"` dtype. Microseconds are used because to perform diff --git a/docs/user-guide/data-structure/dataarray.md b/docs/user-guide/data-structures/dataarray.md similarity index 95% rename from docs/user-guide/data-structure/dataarray.md rename to docs/user-guide/data-structures/dataarray.md index 7d7a62e6..9e48ff4e 100644 --- a/docs/user-guide/data-structure/dataarray.md +++ b/docs/user-guide/data-structures/dataarray.md @@ -16,7 +16,7 @@ os.chdir("../../_data") {py:class}`~xdas.DataArray` is the base class to load and manipulate big datasets to in *xdas*. It is mainly composed of two attributes: - `data`: any N-dimensional array-like object. Compared to *xarray* `xdas.DataArray` are more permissive to the kinds of array-like objects that can be used. In particular, [virtual arrays](../virtual-datasets) can be used. 
-- `coords`: a dict-like container of coordinates. As opposed to *xarray*, which uses dense arrays to label each point, *xdas* also implements [interpolated coordinates](../interpolated-coordinates) that provides an efficient representation of evenly spaced data (gracefully handling gaps and small sampling variations). +- `coords`: a dict-like container of coordinates. As opposed to *xarray*, which uses dense arrays to label each point, *xdas* also implements [interpolated coordinates](../coordinates/interpolated-coordinates) that provides an efficient representation of evenly spaced data (gracefully handling gaps and small sampling variations). ![](/_static/dataarray.svg) @@ -30,7 +30,7 @@ In the following examples, we use only one `DataArray`, if you have several `Dat ## Creating a DataArray -The user can wrap together an n-dimensional array and some related coordinates. See the related description of how to create coordinates [here](../interpolated-coordinates.md). For example: +The user can wrap together an n-dimensional array and some related coordinates. See the related description of how to create coordinates [here](../coordinates/interpolated-coordinates.md). For example: ```{code-cell} diff --git a/docs/user-guide/data-structure/datacollection.md b/docs/user-guide/data-structures/datacollection.md similarity index 100% rename from docs/user-guide/data-structure/datacollection.md rename to docs/user-guide/data-structures/datacollection.md diff --git a/docs/user-guide/data-structure/index.md b/docs/user-guide/data-structures/index.md similarity index 98% rename from docs/user-guide/data-structure/index.md rename to docs/user-guide/data-structures/index.md index 3745bc91..bc158267 100644 --- a/docs/user-guide/data-structure/index.md +++ b/docs/user-guide/data-structures/index.md @@ -1,4 +1,4 @@ -# Data Structure +# Data Structures Xdas leverages two main data structures. 
diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index 4ab42147..b8323343 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -3,10 +3,10 @@ ```{toctree} :maxdepth: 1 -data-structure/index +data-structures/index +coordinates/index data-formats virtual-datasets -interpolated-coordinates miniseed convert-displacement atoms diff --git a/docs/user-guide/virtual-datasets.md b/docs/user-guide/virtual-datasets.md index 752b14a7..f271d99a 100644 --- a/docs/user-guide/virtual-datasets.md +++ b/docs/user-guide/virtual-datasets.md @@ -49,7 +49,7 @@ To handle individual files, multiple files, and virtual datasets, *xdas* offers | {py:func}`xdas.open_mfdatatree` | {py:class}`~xdas.DataCollection` | Open a directory tree of files, organizing data in a data collection. | | {py:func}`xdas.open_datacollection` | {py:class}`~xdas.DataCollection` | Open a (virtual) collection. | -Please refer to the [](data-structure/datacollection.md) section for the functions that return a data collection. +Please refer to the [](data-structures/datacollection.md) section for the functions that return a data collection. ## Linking multi-file datasets From 3f06a9e7bb519e116b45fa90c060bd5d58f8c3a6 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 19:31:42 +0100 Subject: [PATCH 074/104] Add some doc about coordinate system. --- docs/user-guide/coordinates/index.md | 44 ++++++++++++++++++- .../coordinates/sampled-coordinates.md | 9 ++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 docs/user-guide/coordinates/sampled-coordinates.md diff --git a/docs/user-guide/coordinates/index.md b/docs/user-guide/coordinates/index.md index 12770890..8efd8dc5 100644 --- a/docs/user-guide/coordinates/index.md +++ b/docs/user-guide/coordinates/index.md @@ -1,3 +1,45 @@ +--- +file_format: mystnb +kernelspec: + name: python3 +--- + # Coordinates -TODO \ No newline at end of file + +{py:class}`~xdas.DataArray` is the base class in *xdas*. 
It is mainly composed of a N-dimensional array and of a set of {py:class}`~xdas.Coordinate` objects that are gathered in a {py:class}`~xdas.Coordinates` dict-like object than can be accessed by the `DataArray.coords` attribute. Xdas comme with several flavours of {py:class}`~xdas.Coordinate` objects. + +| Type | Description | `data` | +|:---|:---|:---:| +| {py:class}`~xdas.coordinates.ScalarCoordinate` | Used to label 0D dimensions | `{"value": any}` | +| {py:class}`~xdas.coordinates.DefaultCoordinate` | Each value is equal to its index | `{"size": int}` | +| {py:class}`~xdas.coordinates.DenseCoordinate` | Each index is mapped to a given value | `array-like[any]` | +| {py:class}`~xdas.coordinates.InterpCoordinate` | Values are interpolated linearly between tie points | `{"tie_indices": array-like[int], "tie_values": array-like[any]}` | +| {py:class}`~xdas.coordinates.SampledCoordinate` | Values are given as a multiple of a fixed sampling interval and several start values | `{"tie_values": array-like[any], "tie_indices": array-like[int], "sampling_interval": any}` | + +## Per type information + +```{toctree} +:maxdepth: 1 + +interpolated-coordinates +sampled-coordinates +``` + + \ No newline at end of file diff --git a/docs/user-guide/coordinates/sampled-coordinates.md b/docs/user-guide/coordinates/sampled-coordinates.md new file mode 100644 index 00000000..e02500ff --- /dev/null +++ b/docs/user-guide/coordinates/sampled-coordinates.md @@ -0,0 +1,9 @@ +--- +file_format: mystnb +kernelspec: + name: python3 +--- + +# Sampled Coordinates + +TODO \ No newline at end of file From c3e6798fc0b6e53b688eaff1080c0e325e08f72e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 19:34:28 +0100 Subject: [PATCH 075/104] Update coordinate documentation to include 'name' column in type descriptions --- docs/user-guide/coordinates/index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/user-guide/coordinates/index.md 
b/docs/user-guide/coordinates/index.md index 8efd8dc5..59689240 100644 --- a/docs/user-guide/coordinates/index.md +++ b/docs/user-guide/coordinates/index.md @@ -9,13 +9,13 @@ kernelspec: {py:class}`~xdas.DataArray` is the base class in *xdas*. It is mainly composed of a N-dimensional array and of a set of {py:class}`~xdas.Coordinate` objects that are gathered in a {py:class}`~xdas.Coordinates` dict-like object than can be accessed by the `DataArray.coords` attribute. Xdas comme with several flavours of {py:class}`~xdas.Coordinate` objects. -| Type | Description | `data` | -|:---|:---|:---:| -| {py:class}`~xdas.coordinates.ScalarCoordinate` | Used to label 0D dimensions | `{"value": any}` | -| {py:class}`~xdas.coordinates.DefaultCoordinate` | Each value is equal to its index | `{"size": int}` | -| {py:class}`~xdas.coordinates.DenseCoordinate` | Each index is mapped to a given value | `array-like[any]` | -| {py:class}`~xdas.coordinates.InterpCoordinate` | Values are interpolated linearly between tie points | `{"tie_indices": array-like[int], "tie_values": array-like[any]}` | -| {py:class}`~xdas.coordinates.SampledCoordinate` | Values are given as a multiple of a fixed sampling interval and several start values | `{"tie_values": array-like[any], "tie_indices": array-like[int], "sampling_interval": any}` | +| Type | Description | `name` | `data` | +|:---|:---|:---:|:---:| +| {py:class}`~xdas.coordinates.ScalarCoordinate` | Used to label 0D dimensions | `scalar` | `{"value": any}` | +| {py:class}`~xdas.coordinates.DefaultCoordinate` | Each value is equal to its index | `default` | `{"size": int}` | +| {py:class}`~xdas.coordinates.DenseCoordinate` | Each index is mapped to a given value | `dense` | `array-like[any]` | +| {py:class}`~xdas.coordinates.InterpCoordinate` | Values are interpolated linearly between tie points | `interpolated` | `{"tie_indices": array-like[int], "tie_values": array-like[any]}` | +| {py:class}`~xdas.coordinates.SampledCoordinate` | Values are 
given as a multiple of a fixed sampling interval and several start values | `sampled` | `{"tie_values": array-like[any], "tie_indices": array-like[int], "sampling_interval": any}` | ## Per type information From 556ee949ea5dc1214dad620697232909f45c84fb Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 16 Jan 2026 19:37:09 +0100 Subject: [PATCH 076/104] Fix typo in coordinates documentation and clarify information location --- docs/user-guide/coordinates/index.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/user-guide/coordinates/index.md b/docs/user-guide/coordinates/index.md index 59689240..62cb7a4f 100644 --- a/docs/user-guide/coordinates/index.md +++ b/docs/user-guide/coordinates/index.md @@ -17,6 +17,8 @@ kernelspec: | {py:class}`~xdas.coordinates.InterpCoordinate` | Values are interpolated linearly between tie points | `interpolated` | `{"tie_indices": array-like[int], "tie_values": array-like[any]}` | | {py:class}`~xdas.coordinates.SampledCoordinate` | Values are given as a multiple of a fixed sampling interval and several start values | `sampled` | `{"tie_values": array-like[any], "tie_indices": array-like[int], "sampling_interval": any}` | +In the current state fo the documentation most of the coordinate information can be found in the [Interpolated Coordinate](interpolated-coordinates) page. + ## Per type information ```{toctree} From 0ebb613c59f44dcb9c5a515d8c1900fa935996cb Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Tue, 20 Jan 2026 18:33:58 +0100 Subject: [PATCH 077/104] Minor formating. 
--- xdas/__init__.py | 6 +----- xdas/coordinates/__init__.py | 6 +----- xdas/core/dataarray.py | 3 ++- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/xdas/__init__.py b/xdas/__init__.py index 9326541c..e98f9d11 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -10,11 +10,7 @@ synthetics, virtual, ) -from .coordinates import ( - Coordinate, - Coordinates, - get_sampling_interval, -) +from .coordinates import Coordinate, Coordinates, get_sampling_interval from .core import dataarray, datacollection, methods, numpy, routines from .core.dataarray import DataArray from .core.datacollection import DataCollection, DataMapping, DataSequence diff --git a/xdas/coordinates/__init__.py b/xdas/coordinates/__init__.py index 09f17353..f7eaeaee 100644 --- a/xdas/coordinates/__init__.py +++ b/xdas/coordinates/__init__.py @@ -1,8 +1,4 @@ -from .core import ( - Coordinate, - Coordinates, - get_sampling_interval, -) +from .core import Coordinate, Coordinates, get_sampling_interval from .default import DefaultCoordinate from .dense import DenseCoordinate from .interp import InterpCoordinate diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index c8efebcd..5100cee4 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -129,7 +129,8 @@ def __array__(self, dtype=None): return self.data.__array__(dtype) def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - from .routines import broadcast_coords, broadcast_to # TODO: circular import + from .routines import broadcast_coords # TODO: circular import + from .routines import broadcast_to if not method == "__call__": return NotImplemented From 7175e1aa7ecc68a3bcc5f095d09224bbb736096d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Falco?= Date: Mon, 16 Feb 2026 18:08:42 +0100 Subject: [PATCH 078/104] Function to split data array into chunks according to available memory limit --- xdas/__init__.py | 1 + xdas/core/routines.py | 75 +++++++++++++++++++++++++++++++++++++++++++ 2 files 
changed, 76 insertions(+) diff --git a/xdas/__init__.py b/xdas/__init__.py index e9a85cb5..b3d1f156 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -25,5 +25,6 @@ open_mfdatacollection, open_mfdatatree, plot_availability, + fit_into_memory, split, ) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index bc9927f3..6639a39e 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -4,6 +4,7 @@ from collections import defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from glob import glob +import psutil import numpy as np import pandas as pd @@ -741,6 +742,80 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None): return DataArray(data, coords, dims, name, attrs) +import sys +from types import ModuleType, FunctionType +from gc import get_referents + +# Custom objects know their class. +# Function objects seem to know way too much, including modules. +# Exclude modules as well. +BLACKLIST = type, ModuleType, FunctionType + +def getsize(obj): + """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714 """ + if isinstance(obj, BLACKLIST): + raise TypeError('getsize() does not take argument of type: '+ str(type(obj))) + seen_ids = set() + size = 0 + objects = [obj] + while objects: + need_referents = [] + for obj in objects: + if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids: + seen_ids.add(id(obj)) + size += sys.getsizeof(obj) + need_referents.append(obj) + objects = get_referents(*need_referents) + return size + +def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinuities", dim="first", tolerance=None): + """ + Check if a data array is too large to fit into a memory limit and split it if it is the case. 
+ + Splitting can either be performed at each discontinuity (along interpolated + coordinates), at a given set of indices (give as a list of int) or in order to get + a given number of equal sized chunks (if a single int is provided). + + Parameters + ---------- + da : DataArray + The data array to split + RAM_limit : float, optional + Ratio of the available memory to not exceed. + indices_or_sections : str, int or list of int, optional + If `indices_or_section` is an integer N, the array will be divided into N + almost equal (can differ by one element if the `dim` size is not a multiple of + N). If `indices_or_section` is a 1-D array of sorted integers, the entries + indicate where the array is split along `dim`. For example, `[2, 3]` would, for + `dim="first"`, result in [da[:2], da[2:3], da[3:]]. If `indices_or_section` is + "discontinuities", the `dim` must be an interpolated coordinate and splitting + will occurs at locations where they are two consecutive tie_indices with only + one index of difference and where the tie_values difference is greater than + `tolerance`. Default to "discontinuities". + dim : str, optional + The dimension along which to split, by default "first" + tolerance : float or timedelta64, optional + If `indices_or_sections="discontinuities"` split will only occur on gaps and + overlaps that are bigger than `tolerance`. Zero tolerance by default. + + Returns + ------- + list of DataArray + The splitted data array. + """ + + available_RAM = psutil.virtual_memory().available # in bytes + n_chunks = 1+int(getsize(da) / (RAM_limit * available_RAM)) + + # n_chunks = max(2, n_chunks_to_fit) # make at least 2 chunks even if fitting into memory? 
+ + # print(f"DataArray size : {getsize(da)/1e9} MB") + # print(f"Available RAM : {available_RAM/1e9} MB") + # print(f"Memory limit : {RAM_limit * available_RAM/1e9} MB") + # print("n_chunks : ", n_chunks) + + return split(da, n_chunks, dim, tolerance) + def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None): """ From 77d61d4fb341b58dbacf6ea3827aa2a75c78bf63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Falco?= Date: Tue, 17 Feb 2026 15:05:25 +0100 Subject: [PATCH 079/104] add psutil as a dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2ddc83ca..98e4a156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "obspy", "pandas", "plotly", + "psutil", "scipy", "tqdm", "watchdog", From 5b59d65732f79fe629de7008096a8ce43b45742e Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 14:41:53 +0100 Subject: [PATCH 080/104] Fix obspy 1.4.2 compatibility. --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 2ddc83ca..6c2d19f8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ dependencies = [ "obspy", "pandas", "plotly", + "setuptools<82.0.0", "scipy", "tqdm", "watchdog", From 630b254dbd3229c33ed7e114d08bdfb1310a9987 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 14:42:53 +0100 Subject: [PATCH 081/104] Fix numpy 2.4 incompatibility. --- docs/release-notes.md | 13 ++++++------- tests/test_numpy.py | 5 ++++- xdas/core/numpy.py | 7 ++++++- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 0dd5b92a..82f4eeff 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,11 +1,14 @@ # Release notes +## 0.2.5 +- Fix numpy 2.4 and obspy 1.4.2 incompatibilities (@atrabatto). + ## 0.2.4 - Add StreamWriter to write long time series to miniSEED (@marbail). 
- Fix OptaSense engine wrong axis attribution (@smouellet). -- Fix ASN (OptoDAS) engine: handling of roiDec (@AndresLaurine) -- Fix nan handling for several methods (@ClaudioStrumia) -- Fix InterpCoordinate.get_availabilities (@AMordret) +- Fix ASN (OptoDAS) engine: handling of roiDec (@AndresLaurine). +- Fix nan handling for several methods (@ClaudioStrumia). +- Fix `InterpCoordinate.get_availabilities` (@AMordret). ## 0.2.3 - Fix Febus engine (round timestamps to closest us). @@ -25,23 +28,19 @@ - Fix applying fft functions in presence of non-dimensional coordinates. ## 0.2 - - Add Dask virtualization backend for non-HDF5 formats (@atrabattoni). - Add support for miniSEED format (@atrabattoni, @chauvetige). - Add support for Silixa (TDMS) format (@atrabattoni, @Stutzmann). ## 0.1.2 - - Add ZeroMQ streaming capabilities (@atrabattoni). - Add support of Terra15 format (@chauvetige). - Fix Febus engine (@ClaudioStrumia). ## 0.1.1 - - Add support for `hdf5plugin` compression schemes. - Drop `netCDF4` dependency and only use `h5netcdf` to fix incompatibilities. - Drop useless `dask` dependency. ## 0.1 - Initial stable version. \ No newline at end of file diff --git a/tests/test_numpy.py b/tests/test_numpy.py index 235aa969..0090303a 100644 --- a/tests/test_numpy.py +++ b/tests/test_numpy.py @@ -51,12 +51,15 @@ def test_returns_dataarray(self): if numpy_function == np.clip: result = numpy_function(da, -1, 1) assert isinstance(result, DataArray) - elif numpy_function == np.trapz: + elif numpy_function.__name__ == "trapz": if np.lib.NumpyVersion(np.__version__) >= "2.0.0b1": pass # TODO: this function is deprecated, let it go... 
else: result = numpy_function(da) assert isinstance(result, np.ndarray) + elif numpy_function.__name__ == "trapezoid": + result = numpy_function(da) + assert isinstance(result, np.ndarray) elif numpy_function in [np.diff, np.ediff1d]: result = numpy_function(da) assert isinstance(result, np.ndarray) diff --git a/xdas/core/numpy.py b/xdas/core/numpy.py index 59655ef4..3319329d 100644 --- a/xdas/core/numpy.py +++ b/xdas/core/numpy.py @@ -1,6 +1,7 @@ from inspect import signature import numpy as np +from numpy.lib import NumpyVersion from .dataarray import HANDLED_NUMPY_FUNCTIONS, DataArray @@ -112,6 +113,10 @@ def wrapper(*args, **kwargs): handled(drop_coords=True) handled(drop_coords=True)(np.diff) handled(drop_coords=True)(np.ediff1d) -handled(drop_coords=True)(np.trapz) + +if NumpyVersion(np.__version__) < "2.4.0": + handled(drop_coords=True)(np.trapz) +if NumpyVersion(np.__version__) >= "2.0.0": + handled(drop_coords=True)(np.trapezoid) # TODO: gradient From 96b6dc27360e46fc8f0ddacf0dc0af0b594fc3db Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 15:00:52 +0100 Subject: [PATCH 082/104] Add direct import path to coordinates for Seisbench. 
--- xdas/__init__.py | 11 ++++++++++- xdas/io/tdms.py | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/xdas/__init__.py b/xdas/__init__.py index e98f9d11..8c6c5be1 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -10,7 +10,16 @@ synthetics, virtual, ) -from .coordinates import Coordinate, Coordinates, get_sampling_interval +from .coordinates import ( + Coordinate, + Coordinates, + DefaultCoordinate, + DenseCoordinate, + InterpCoordinate, + SampledCoordinate, + ScalarCoordinate, + get_sampling_interval, +) from .core import dataarray, datacollection, methods, numpy, routines from .core.dataarray import DataArray from .core.datacollection import DataCollection, DataMapping, DataSequence diff --git a/xdas/io/tdms.py b/xdas/io/tdms.py index 30e444bd..c2b94305 100644 --- a/xdas/io/tdms.py +++ b/xdas/io/tdms.py @@ -455,7 +455,7 @@ def _initialise_data(self): self._seg2_length = 0 else: self._tdms_file.seek(nso + 12, 0) - (seg2_nso, seg2_rdo) = struct.unpack(" Date: Thu, 5 Mar 2026 15:13:37 +0100 Subject: [PATCH 083/104] Version 0.2.5. 
--- docs/conf.py | 2 +- pyproject.toml | 2 +- tests/test_xdas.py | 9 +++++++++ xdas/__init__.py | 2 ++ 4 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 tests/test_xdas.py diff --git a/docs/conf.py b/docs/conf.py index 7b0e46d2..e3b8a9fd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -11,7 +11,7 @@ author = "Alister Trabattoni" # The full version, including alpha/beta/rc tags -release = "0.2.4" +release = "0.2.5" # -- General configuration --------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 6c2d19f8..b8585569 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "xdas" -version = "0.2.4" +version = "0.2.5" requires-python = ">= 3.10" authors = [ { name = "Alister Trabattoni", email = "alister.trabattoni@gmail.com" }, diff --git a/tests/test_xdas.py b/tests/test_xdas.py new file mode 100644 index 00000000..3af18b0b --- /dev/null +++ b/tests/test_xdas.py @@ -0,0 +1,9 @@ +import xdas as xd + + +def test_version(): + version = xd.__version__ + assert isinstance(version, str) + version_parts = version.split(".") + for part in version_parts: + assert part.isdigit() diff --git a/xdas/__init__.py b/xdas/__init__.py index 8c6c5be1..5f281fc4 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -1,3 +1,5 @@ +__version__ = "0.2.5" + from . import ( atoms, config, From 661521a97672d08a53a4eb99c35ee3ecea4f5575 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 15:30:07 +0100 Subject: [PATCH 084/104] Formatting.
--- xdas/__init__.py | 2 +- xdas/core/dataarray.py | 6 ++++-- xdas/core/datacollection.py | 4 +++- xdas/core/routines.py | 27 ++++++++++++++++++--------- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/xdas/__init__.py b/xdas/__init__.py index dd5944d4..908e0276 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -34,12 +34,12 @@ combine_by_coords, combine_by_field, concatenate, + fit_into_memory, open_dataarray, open_datacollection, open_mfdataarray, open_mfdatacollection, open_mfdatatree, plot_availability, - fit_into_memory, split, ) diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index 43a5f5e1..e2d5f3df 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -1,7 +1,7 @@ import copy +import os import warnings from functools import partial -import os import h5netcdf import h5py @@ -885,7 +885,9 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs) # write data - if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)): + if os.path.dirname(fname) is not "" and not os.path.exists( + os.path.dirname(fname) + ): os.makedirs(os.path.dirname(fname), exist_ok=True) with h5netcdf.File(fname, mode=mode) as file: diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index a2df095f..f9adf14a 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -239,7 +239,9 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): location = "/".join([name, str(key)]) if group is not None: location = "/".join([group, location]) - if os.path.dirname(fname) is not "" and not os.path.exists(os.path.dirname(fname)): + if os.path.dirname(fname) is not "" and not os.path.exists( + os.path.dirname(fname) + ): os.makedirs(os.path.dirname(fname), exist_ok=True) self[key].to_netcdf( fname, diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 
3b3c6d59..ffbfca6a 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -5,11 +5,11 @@ from concurrent.futures import ProcessPoolExecutor, as_completed from glob import glob from itertools import pairwise -import psutil import numpy as np import pandas as pd import plotly.express as px +import psutil import xarray as xr from tqdm import tqdm @@ -743,19 +743,21 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None): return DataArray(data, coords, dims, name, attrs) + import sys -from types import ModuleType, FunctionType from gc import get_referents +from types import FunctionType, ModuleType # Custom objects know their class. # Function objects seem to know way too much, including modules. # Exclude modules as well. BLACKLIST = type, ModuleType, FunctionType + def getsize(obj): - """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714 """ + """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714""" if isinstance(obj, BLACKLIST): - raise TypeError('getsize() does not take argument of type: '+ str(type(obj))) + raise TypeError("getsize() does not take argument of type: " + str(type(obj))) seen_ids = set() size = 0 objects = [obj] @@ -769,7 +771,14 @@ def getsize(obj): objects = get_referents(*need_referents) return size -def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinuities", dim="first", tolerance=None): + +def fit_into_memory( + da, + RAM_limit: float = 0.8, + indices_or_sections="discontinuities", + dim="first", + tolerance=None, +): """ Check if a data array is too large to fit into a memory limit and split it if it is the case. @@ -782,7 +791,7 @@ def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinu da : DataArray The data array to split RAM_limit : float, optional - Ratio of the available memory to not exceed. + Ratio of the available memory to not exceed. 
indices_or_sections : str, int or list of int, optional If `indices_or_section` is an integer N, the array will be divided into N almost equal (can differ by one element if the `dim` size is not a multiple of @@ -805,9 +814,9 @@ def fit_into_memory(da, RAM_limit : float = .8 , indices_or_sections="discontinu The splitted data array. """ - available_RAM = psutil.virtual_memory().available # in bytes - n_chunks = 1+int(getsize(da) / (RAM_limit * available_RAM)) - + available_RAM = psutil.virtual_memory().available # in bytes + n_chunks = 1 + int(getsize(da) / (RAM_limit * available_RAM)) + # n_chunks = max(2, n_chunks_to_fit) # make at least 2 chunks even if fitting into memory? # print(f"DataArray size : {getsize(da)/1e9} MB") From 1bb11085b40889ecb064d8513133441f51281ab4 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 15:44:05 +0100 Subject: [PATCH 085/104] Remove auto RAM chunsize related things. --- pyproject.toml | 1 - xdas/__init__.py | 1 - xdas/core/routines.py | 84 ------------------------------------------- 3 files changed, 86 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7d5c4ee5..b8585569 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ dependencies = [ "obspy", "pandas", "plotly", - "psutil", "setuptools<82.0.0", "scipy", "tqdm", diff --git a/xdas/__init__.py b/xdas/__init__.py index 908e0276..5f281fc4 100644 --- a/xdas/__init__.py +++ b/xdas/__init__.py @@ -34,7 +34,6 @@ combine_by_coords, combine_by_field, concatenate, - fit_into_memory, open_dataarray, open_datacollection, open_mfdataarray, diff --git a/xdas/core/routines.py b/xdas/core/routines.py index ffbfca6a..c3a75dcb 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -9,7 +9,6 @@ import numpy as np import pandas as pd import plotly.express as px -import psutil import xarray as xr from tqdm import tqdm @@ -744,89 +743,6 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None): return DataArray(data, 
coords, dims, name, attrs) -import sys -from gc import get_referents -from types import FunctionType, ModuleType - -# Custom objects know their class. -# Function objects seem to know way too much, including modules. -# Exclude modules as well. -BLACKLIST = type, ModuleType, FunctionType - - -def getsize(obj): - """sum size of object & members. See https://stackoverflow.com/a/30316760/12774714""" - if isinstance(obj, BLACKLIST): - raise TypeError("getsize() does not take argument of type: " + str(type(obj))) - seen_ids = set() - size = 0 - objects = [obj] - while objects: - need_referents = [] - for obj in objects: - if not isinstance(obj, BLACKLIST) and id(obj) not in seen_ids: - seen_ids.add(id(obj)) - size += sys.getsizeof(obj) - need_referents.append(obj) - objects = get_referents(*need_referents) - return size - - -def fit_into_memory( - da, - RAM_limit: float = 0.8, - indices_or_sections="discontinuities", - dim="first", - tolerance=None, -): - """ - Check if a data array is too large to fit into a memory limit and split it if it is the case. - - Splitting can either be performed at each discontinuity (along interpolated - coordinates), at a given set of indices (give as a list of int) or in order to get - a given number of equal sized chunks (if a single int is provided). - - Parameters - ---------- - da : DataArray - The data array to split - RAM_limit : float, optional - Ratio of the available memory to not exceed. - indices_or_sections : str, int or list of int, optional - If `indices_or_section` is an integer N, the array will be divided into N - almost equal (can differ by one element if the `dim` size is not a multiple of - N). If `indices_or_section` is a 1-D array of sorted integers, the entries - indicate where the array is split along `dim`. For example, `[2, 3]` would, for - `dim="first"`, result in [da[:2], da[2:3], da[3:]]. 
If `indices_or_section` is - "discontinuities", the `dim` must be an interpolated coordinate and splitting - will occurs at locations where they are two consecutive tie_indices with only - one index of difference and where the tie_values difference is greater than - `tolerance`. Default to "discontinuities". - dim : str, optional - The dimension along which to split, by default "first" - tolerance : float or timedelta64, optional - If `indices_or_sections="discontinuities"` split will only occur on gaps and - overlaps that are bigger than `tolerance`. Zero tolerance by default. - - Returns - ------- - list of DataArray - The splitted data array. - """ - - available_RAM = psutil.virtual_memory().available # in bytes - n_chunks = 1 + int(getsize(da) / (RAM_limit * available_RAM)) - - # n_chunks = max(2, n_chunks_to_fit) # make at least 2 chunks even if fitting into memory? - - # print(f"DataArray size : {getsize(da)/1e9} MB") - # print(f"Available RAM : {available_RAM/1e9} MB") - # print(f"Memory limit : {RAM_limit * available_RAM/1e9} MB") - # print("n_chunks : ", n_chunks) - - return split(da, n_chunks, dim, tolerance) - - def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None): """ Split a data array along a dimension. From cb0a93b4a59ea9a5b894fbd85e8ae6587ecc3121 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 16:05:27 +0100 Subject: [PATCH 086/104] Add `create_dirs` kwarg. 
--- xdas/core/dataarray.py | 23 +++++++++++++++++------ xdas/core/datacollection.py | 35 ++++++++++++++++++++++++++++------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/xdas/core/dataarray.py b/xdas/core/dataarray.py index e2d5f3df..1fa2cd94 100644 --- a/xdas/core/dataarray.py +++ b/xdas/core/dataarray.py @@ -826,7 +826,15 @@ def from_stream(cls, st, dims=("channel", "time")): } return cls(data, {dims[0]: channel, dims[1]: time}) - def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): + def to_netcdf( + self, + fname, + mode="w", + group=None, + virtual=None, + encoding=None, + create_dirs=False, + ): """ Write DataArray contents to a netCDF file. @@ -850,6 +858,8 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): the `h5netcdf` engine to write the data. If you want to use a specific plugin for compression, you can use the `hdf5plugin` package. For example, to use the ZFP compression, you can use the `hdf5plugin.Zfp` class. + create_dirs : bool, optional + Whether to create parent directories if they do not exist. Default is False. 
Examples -------- @@ -884,12 +894,13 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): for coord in self.coords.values(): dataset, variable_attrs = coord.to_dataset(dataset, variable_attrs) - # write data - if os.path.dirname(fname) is not "" and not os.path.exists( - os.path.dirname(fname) - ): - os.makedirs(os.path.dirname(fname), exist_ok=True) + # create parent directories if needed + if create_dirs: + dirname = os.path.dirname(fname) + if dirname: + os.makedirs(dirname, exist_ok=True) + # write data with h5netcdf.File(fname, mode=mode) as file: # group if group is not None and group not in file: diff --git a/xdas/core/datacollection.py b/xdas/core/datacollection.py index f9adf14a..aafef713 100644 --- a/xdas/core/datacollection.py +++ b/xdas/core/datacollection.py @@ -231,7 +231,15 @@ def fields(self): ) return uniquifiy(out) - def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): + def to_netcdf( + self, + fname, + mode="w", + group=None, + virtual=None, + encoding=None, + create_dirs=False, + ): if mode == "w" and group is None and os.path.exists(fname): os.remove(fname) for key in self: @@ -239,10 +247,10 @@ def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): location = "/".join([name, str(key)]) if group is not None: location = "/".join([group, location]) - if os.path.dirname(fname) is not "" and not os.path.exists( - os.path.dirname(fname) - ): - os.makedirs(os.path.dirname(fname), exist_ok=True) + if create_dirs: + dirname = os.path.dirname(fname) + if dirname: + os.makedirs(dirname, exist_ok=True) self[key].to_netcdf( fname, mode="a", @@ -445,9 +453,22 @@ def to_mapping(self): def from_mapping(cls, data): return cls(data.values(), data.name) - def to_netcdf(self, fname, mode="w", group=None, virtual=None, encoding=None): + def to_netcdf( + self, + fname, + mode="w", + group=None, + virtual=None, + encoding=None, + create_dirs=False, + ): self.to_mapping().to_netcdf( - 
fname, mode=mode, group=group, virtual=virtual, encoding=encoding + fname, + mode=mode, + group=group, + virtual=virtual, + encoding=encoding, + create_dirs=create_dirs, ) @classmethod From 0a1616b63915b07defb4624c036e0957eb581388 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 16:16:36 +0100 Subject: [PATCH 087/104] Add test to `create_dirs`. --- tests/test_dataarray.py | 10 ++++++++++ tests/test_datacollection.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/tests/test_dataarray.py b/tests/test_dataarray.py index 4a251f7e..acdabb35 100644 --- a/tests/test_dataarray.py +++ b/tests/test_dataarray.py @@ -458,6 +458,16 @@ def test_io_attrs(self): assert result.attrs == attrs assert result.equals(da) + def test_io_create_dirs(self): + da = xd.DataArray(np.arange(3)) + with TemporaryDirectory() as dirpath: + path = os.path.join(dirpath, "subdir", "tmp.nc") + with pytest.raises(FileNotFoundError, match="No such file or directory"): + da.to_netcdf(path) + da.to_netcdf(path, create_dirs=True) + result = xd.DataArray.from_netcdf(path) + assert result.equals(da) + def test_ufunc(self): da = wavelet_wavefronts() result = np.add(da, 1) diff --git a/tests/test_datacollection.py b/tests/test_datacollection.py index 7ebd2c0f..d70cf22c 100644 --- a/tests/test_datacollection.py +++ b/tests/test_datacollection.py @@ -68,6 +68,23 @@ def test_io(self): result = xd.open_datacollection(path) assert result.equals(dc) + def test_io_create_dirs(self): + da = wavelet_wavefronts() + dc = xd.DataCollection( + { + "das1": da, + "das2": da, + }, + "instrument", + ) + with TemporaryDirectory() as dirpath: + path = os.path.join(dirpath, "subdir", "tmp.nc") + with pytest.raises(FileNotFoundError, match="No such file or directory"): + dc.to_netcdf(path) + dc.to_netcdf(path, create_dirs=True) + result = xd.DataCollection.from_netcdf(path) + assert result.equals(dc) + def test_depth_counter(self): da = wavelet_wavefronts() da.name = "da" From 
4547c6a7be9e064b894a85cb4b8f2c8873282a17 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 16:18:49 +0100 Subject: [PATCH 088/104] Add release note for `create_dirs`. --- docs/release-notes.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index aa8001a3..9234b933 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,8 +1,10 @@ # Release notes ## 0.2.5 -- Fix numpy 2.4 and obspy 1.4.2 incompatibilities (@atrabatto). - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). +- Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). +- Fix numpy 2.4 and obspy 1.4.2 incompatibilities (@atrabatto). + ## 0.2.4 - Add StreamWriter to write long time series to miniSEED (@marbail). From 9f4f60b89a2d56066b27e5a088a6c30a4e07c593 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 16:48:21 +0100 Subject: [PATCH 089/104] Fix release note. --- docs/release-notes.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 9234b933..2085940d 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -3,8 +3,7 @@ ## 0.2.5 - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). - Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). -- Fix numpy 2.4 and obspy 1.4.2 incompatibilities (@atrabatto). - +- Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). ## 0.2.4 - Add StreamWriter to write long time series to miniSEED (@marbail). 
From 05ec6391b1df1f7d959bdf34eba645ecdd3ef06e Mon Sep 17 00:00:00 2001 From: Martijn van den Ende Date: Thu, 5 Mar 2026 17:03:36 +0100 Subject: [PATCH 090/104] Fixed discontinuity bug --- xdas/io/asn.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 7b41ba4d..b1c799be 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -40,12 +40,9 @@ def read(fname: str) -> DataArray: dist_tie_vals.append(float(all_dists[i])) # Repeat the procedure for the index/distance at which the ROI ends. - # A "discontinuity" in the interpolation scheme is created in the - # following way: n_roi = [start, stop-1, stop, start, stop-1, stop, ...] i = bisect_left(all_dists, n_end * dx) - for j in reversed(range(2)): - dist_tie_inds.append(i-j) - dist_tie_vals.append(float(all_dists[i-j])) + dist_tie_inds.append(i) + dist_tie_vals.append(float(all_dists[i])) nt = data.shape[0] time = {"tie_indices": [0, nt - 1], "tie_values": [t0, t0 + (nt - 1) * dt]} From c7bb40ee2f6fd169872d29b11f78ca2951cab6a5 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni <32540975+atrabattoni@users.noreply.github.com> Date: Thu, 5 Mar 2026 18:13:26 +0100 Subject: [PATCH 091/104] Fix error: use Martijn implementation. 
--- xdas/io/asn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index a818ffd3..747518cd 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -49,7 +49,7 @@ def read(fname, ctype=None): time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") if not ctype["distance"] == "interpolated": raise NotImplementedError("ctype must be 'interpolated' along the 'distance' dim") - distance = Coordinate[ctype["distance"]].from_block(0.0, nx, dx, dim="distance") + distance = {"tie_indices": dist_tie_inds, "tie_values": dist_tie_vals} return DataArray(data, {"time": time, "distance": distance}) From 1c4763c337fa968430c054448e4bee7df7685b95 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 18:24:08 +0100 Subject: [PATCH 092/104] Formatting. --- xdas/io/asn.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xdas/io/asn.py b/xdas/io/asn.py index 747518cd..e1a32530 100644 --- a/xdas/io/asn.py +++ b/xdas/io/asn.py @@ -48,7 +48,9 @@ def read(fname, ctype=None): nt = data.shape[0] time = Coordinate[ctype["time"]].from_block(t0, nt, dt, dim="time") if not ctype["distance"] == "interpolated": - raise NotImplementedError("ctype must be 'interpolated' along the 'distance' dim") + raise NotImplementedError( + "ctype must be 'interpolated' along the 'distance' dim" + ) distance = {"tie_indices": dist_tie_inds, "tie_values": dist_tie_vals} return DataArray(data, {"time": time, "distance": distance}) From 02cfb50a0c7c73aa66099e57199e9f628efa8cca Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 18:26:54 +0100 Subject: [PATCH 093/104] Add release note. --- docs/release-notes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes.md b/docs/release-notes.md index 2085940d..e9f92696 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -3,6 +3,7 @@ ## 0.2.5 - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). 
- Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). +- Add support for multiple ROI for ASN engine (@martijnende). - Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). ## 0.2.4 From d327bcc93b90a5f8a48e5a8709d7ced64f6138e6 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 18:58:40 +0100 Subject: [PATCH 094/104] WIP. --- xdas/coordinates/core.py | 12 ++++++++++++ xdas/coordinates/interp.py | 14 ++++++++------ xdas/coordinates/sampled.py | 12 +++++++++--- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 4154467d..2e81bb86 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -590,6 +590,18 @@ def parse(data, dim=None): return data, dim +def parse_tolerance(tolerance, dtype): + if np.issubdtype(dtype, np.datetime64): + if tolerance is None: + tolerance = np.timedelta64(0, "ns") + elif isinstance(tolerance, (int, float)): + tolerance = np.timedelta64(round(tolerance * 1e9), "ns") + else: + if tolerance is None: + tolerance = 0.0 + return tolerance + + def get_sampling_interval(da, dim, cast=True): """ Returns the sample spacing along a given dimension. 
diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 741e4b43..46b8e31d 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -4,7 +4,13 @@ import pandas as pd from xinterp import forward, inverse -from .core import Coordinate, format_datetime, is_strictly_increasing, parse +from .core import ( + Coordinate, + format_datetime, + is_strictly_increasing, + parse, + parse_tolerance, +) class InterpCoordinate(Coordinate, name="interpolated"): @@ -286,11 +292,7 @@ def decimate(self, q): ) def simplify(self, tolerance=None): - if tolerance is None: - if np.issubdtype(self.dtype, np.datetime64): - tolerance = np.timedelta64(0, "ns") - else: - tolerance = 0.0 + tolerance = parse_tolerance(tolerance, self.dtype) tie_indices, tie_values = douglas_peucker( self.tie_indices, self.tie_values, tolerance ) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 34485eaf..a1b47e3a 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -2,7 +2,13 @@ import numpy as np -from .core import Coordinate, format_datetime, is_strictly_increasing, parse +from .core import ( + Coordinate, + format_datetime, + is_strictly_increasing, + parse, + parse_tolerance, +) CODE_TO_UNITS = { "h": "hours", @@ -384,8 +390,8 @@ def decimate(self, q): return self[::q] def simplify(self, tolerance=None): - if tolerance is None: - tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] + tolerance = parse_tolerance(tolerance, self.dtype) + tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): From b07ab602abd9c6c6b19ecc63f2309285b71eb617 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 09:33:05 +0100 Subject: [PATCH 095/104] Improve things and fix tolerance bug for sampled. 
--- xdas/coordinates/core.py | 6 +++++- xdas/coordinates/sampled.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 2e81bb86..f77bafe1 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -592,13 +592,17 @@ def parse(data, dim=None): def parse_tolerance(tolerance, dtype): if np.issubdtype(dtype, np.datetime64): + unit, step = np.datetime_data(dtype) + dtype = np.dtype(f"timedelta64[{step}{unit}]") if tolerance is None: tolerance = np.timedelta64(0, "ns") elif isinstance(tolerance, (int, float)): tolerance = np.timedelta64(round(tolerance * 1e9), "ns") else: if tolerance is None: - tolerance = 0.0 + tolerance = np.array(0, dtype=dtype)[()] + else: + tolerance = np.array(tolerance, dtype=dtype)[()] return tolerance diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index a1b47e3a..d1235d70 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -391,7 +391,6 @@ def decimate(self, q): def simplify(self, tolerance=None): tolerance = parse_tolerance(tolerance, self.dtype) - tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): From 49b8aad7e65e5a9cd0eaff91028c8d58ff8fe6cb Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 09:46:02 +0100 Subject: [PATCH 096/104] Test simplify methods. 
--- tests/coordinates/test_interp.py | 15 +++++++++++++++ tests/coordinates/test_sampled.py | 17 +++++++++++++++++ xdas/coordinates/core.py | 6 +----- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py index f9ef27a1..bdd65e97 100644 --- a/tests/coordinates/test_interp.py +++ b/tests/coordinates/test_interp.py @@ -285,6 +285,21 @@ def test_simplify(self): coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) assert len(coord.simplify(1.0).tie_indices) == 2 + def test_simplify_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + xp = np.sort(np.random.choice(10000, 1000, replace=False)) + xp[0] = 0 + xp[-1] = 10000 + yp = ( + t0 + + xp.astype("timedelta64[s]") + + np.random.randint(-500, 500, size=1000).astype("timedelta64[ms]") + ) + coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) + assert len(coord.simplify(np.timedelta64(1, "s")).tie_indices) == 2 + assert len(coord.simplify(np.timedelta64(1000, "ms")).tie_indices) == 2 + assert len(coord.simplify(1.0).tie_indices) == 2 + def test_singleton(self): coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) assert coord[0].values == 1.0 diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 82d083d5..29e63ce6 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -678,6 +678,23 @@ def test_simplify_with_tolerance(self): result = coord.simplify(tolerance=0.1) assert np.all(np.abs(result.values - coord.values) <= 0.1) + def test_simplify_with_tolerance_on_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + jitter = np.random.rand(100) * 0.2 - 0.1 + jitter = jitter.astype("timedelta64[ms]") # convert to timedelta + coord = SampledCoordinate( + { + "tie_values": t0 + 10 * np.arange(100) + jitter, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": np.timedelta64(1, "s"), + } + ) + result = 
coord.simplify(tolerance=np.timedelta64(200, "ms")) + assert len(result.tie_values) == 1 + # float tolerance should be treated as seconds + result = coord.simplify(tolerance=0.2) + assert len(result.tie_values) == 1 + class TestSampledCoordinateGetIndexer: def make_coord(self): diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index f77bafe1..1f970bb6 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -592,17 +592,13 @@ def parse(data, dim=None): def parse_tolerance(tolerance, dtype): if np.issubdtype(dtype, np.datetime64): - unit, step = np.datetime_data(dtype) - dtype = np.dtype(f"timedelta64[{step}{unit}]") if tolerance is None: tolerance = np.timedelta64(0, "ns") elif isinstance(tolerance, (int, float)): tolerance = np.timedelta64(round(tolerance * 1e9), "ns") else: if tolerance is None: - tolerance = np.array(0, dtype=dtype)[()] - else: - tolerance = np.array(tolerance, dtype=dtype)[()] + tolerance = 0 return tolerance From d826375d806bf017baaad0590de255897a5007d5 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 10:03:57 +0100 Subject: [PATCH 097/104] Make split work with float tolerance and test it. 
--- tests/coordinates/test_sampled.py | 24 +++++++++++-- tests/test_core.py | 57 ++++++++++++++++++++++--------- xdas/coordinates/interp.py | 1 + xdas/coordinates/sampled.py | 1 + 4 files changed, 64 insertions(+), 19 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 29e63ce6..9ef0467c 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -874,7 +874,7 @@ def test_to_netcdf_and_back(self): class TestGetSplitIndices: - def test_get_split_indices_no_tolerance(self): + def test_no_tolerance(self): coord = SampledCoordinate( {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} ) @@ -882,7 +882,7 @@ def test_get_split_indices_no_tolerance(self): expected = np.array([3]) # indices where segments end assert np.array_equal(div_points, expected) - def test_get_split_indices_with_tolerance(self): + def test_with_tolerance(self): coord = SampledCoordinate( { "tie_values": [0.0, 3.1, 10.0], @@ -894,6 +894,26 @@ def test_get_split_indices_with_tolerance(self): expected = np.array([5]) # only the second gap exceeds tolerance assert np.array_equal(div_points, expected) + def test_with_tolerance_on_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [ + t0, + t0 + np.timedelta64(3, "s") + np.timedelta64(100, "ms"), + t0 + np.timedelta64(10, "s"), + ], + "tie_lengths": [3, 2, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + div_points = coord.get_split_indices(tolerance=np.timedelta64(200, "ms")) + expected = np.array([5]) # only the second gap exceeds tolerance + assert np.array_equal(div_points, expected) + # float tolerance should be treated as seconds + div_points = coord.get_split_indices(tolerance=0.2) + assert np.array_equal(div_points, expected) + class TestFromBlock: def test_from_block(self): diff --git a/tests/test_core.py b/tests/test_core.py index 51c33b2f..ac4237fe 100644 --- 
a/tests/test_core.py +++ b/tests/test_core.py @@ -211,23 +211,6 @@ def test_asdataarray(self): for dim in da.dims: assert np.array_equal(out[dim].values, da[dim].values) - def test_split(self): - da = xd.DataArray( - np.ones(30), - { - "time": { - "tie_indices": [0, 9, 10, 19, 20, 29], - "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0], - }, - }, - ) - assert xd.concatenate(xd.split(da)).equals(da) - assert xd.split(da, tolerance=20.0)[0].equals(da) - - def test_chunk(self): - da = wavelet_wavefronts() - assert xd.concatenate(xd.split(da, 3)).equals(da) - def test_align(self): da1 = xd.DataArray(np.arange(2), {"x": [0, 1]}) da2 = xd.DataArray(np.arange(3), {"y": [2, 3, 4]}) @@ -240,3 +223,43 @@ def test_align(self): da3 = xd.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) with pytest.raises(ValueError, match="differs from one data array to another"): xd.align(da1, da2, da3) + + +class TestSplit: + def test_integer(self): + da = wavelet_wavefronts() + assert xd.concatenate(xd.split(da, 3)).equals(da) + + def test_interp(self): + da = xd.DataArray( + np.ones(30), + { + "time": { + "tie_indices": [0, 9, 10, 19, 20, 29], + "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0], + }, + }, + ) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) + + def test_interp_datetime(self): + da = xd.DataArray( + np.ones(30), + { + "time": { + "tie_indices": [0, 9, 10, 19, 20, 29], + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-01T00:00:09"), + np.datetime64("2000-01-01T00:00:20"), + np.datetime64("2000-01-01T00:00:29"), + np.datetime64("2000-01-01T00:00:40"), + np.datetime64("2000-01-01T00:00:49"), + ], + }, + }, + ) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=np.timedelta64(20, "s"))[0].equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 46b8e31d..3361d7c1 
100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -304,6 +304,7 @@ def get_split_indices(self, tolerance=None): (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) indices += 1 if tolerance is not None: + tolerance = parse_tolerance(tolerance, self.dtype) deltas = self.tie_values[indices + 1] - self.tie_values[indices] indices = indices[np.abs(deltas) >= tolerance] return np.array( diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index d1235d70..cf40511b 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -412,6 +412,7 @@ def simplify(self, tolerance=None): def get_split_indices(self, tolerance=None): indices = self.tie_indices[1:] if tolerance is not None: + tolerance = parse_tolerance(tolerance, self.dtype) deltas = self.tie_values[1:] - ( self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] ) From 363401ca6f7479cd0b40b9fd30054593192b2784 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 10:06:47 +0100 Subject: [PATCH 098/104] Add relase note. --- docs/release-notes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes.md b/docs/release-notes.md index e9f92696..3e85c262 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -4,6 +4,7 @@ - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). - Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). - Add support for multiple ROI for ASN engine (@martijnende). +- `tolerance` can now be passed as seconds for datetime64 coordinates (@martijnende, @atrabattoni) - Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). ## 0.2.4 From a3121dc9f390d2e9553dab00ae42b58ea5918bcb Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 11:28:03 +0100 Subject: [PATCH 099/104] update docstrings related to tolerance. 
--- xdas/core/routines.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index c3a75dcb..3784cfaa 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -40,7 +40,8 @@ def open_mfdatacollection( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -118,7 +119,8 @@ def open_mfdatatree( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -217,7 +219,8 @@ def collect( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. 
squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -284,7 +287,8 @@ def open_mfdataarray( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -769,7 +773,8 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None The dimension along which to split, by default "first" tolerance : float or timedelta64, optional If `indices_or_sections="discontinuities"` split will only occur on gaps and - overlaps that are bigger than `tolerance`. Zero tolerance by default. + overlaps that are bigger than `tolerance`. For time coordinates, numeric + values are considered as seconds. Zero tolerance by default. Returns ------- From 49936336e564ef032b496d438d203de253858da6 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 11:32:34 +0100 Subject: [PATCH 100/104] updata missing tolerance docstring. --- xdas/core/routines.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 3784cfaa..587c955e 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -434,8 +434,9 @@ def asdataarray(obj, tolerance=None): obj : object The objected to convert tolerance : float or datetime64, optional - For dense coordinates, tolerance error for interpolation representation, by - default zero. 
+ For dense coordinates, tolerance error for interpolation representation. + For time coordinates, numeric values are considered as seconds. + Zero by default. Returns ------- @@ -476,7 +477,8 @@ def combine_by_field( The dimension along which concatenate. Default to "first". tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. squeeze : bool, optional Whether to return a Database instead of a DataCollection if the combinatison results in a data collection containing a unique Database. @@ -539,7 +541,8 @@ def combine_by_coords( The dimension along which concatenate. Default to "first". tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. squeeze : bool, optional Whether to return a Database instead of a DataCollection if the combination results in a data collection containing a unique Database. @@ -677,7 +680,8 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None): The dimension along which concatenate. tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following, For time coordinates, numeric values are considered as seconds. + Zero by default. virtual : bool, optional Whether to create a virtual dataset. It requires that all concatenated data arrays are virtual. By default tries to create a virtual dataset if possible. From d0ae59050821603661862d22bc229f0b96d2f0d2 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 12:06:47 +0100 Subject: [PATCH 101/104] Minor fix. 
--- docs/user-guide/coordinates/index.md | 4 ++-- xdas/coordinates/default.py | 2 +- xdas/fft.py | 10 +++++----- xdas/picking.py | 4 +++- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/docs/user-guide/coordinates/index.md b/docs/user-guide/coordinates/index.md index 62cb7a4f..1894df98 100644 --- a/docs/user-guide/coordinates/index.md +++ b/docs/user-guide/coordinates/index.md @@ -7,7 +7,7 @@ kernelspec: # Coordinates -{py:class}`~xdas.DataArray` is the base class in *xdas*. It is mainly composed of a N-dimensional array and of a set of {py:class}`~xdas.Coordinate` objects that are gathered in a {py:class}`~xdas.Coordinates` dict-like object than can be accessed by the `DataArray.coords` attribute. Xdas comme with several flavours of {py:class}`~xdas.Coordinate` objects. +{py:class}`~xdas.DataArray` is the base class in *xdas*. It is mainly composed of a N-dimensional array and a set of {py:class}`~xdas.Coordinate` objects that are gathered in a {py:class}`~xdas.Coordinates` dict-like object that can be accessed by the `DataArray.coords` attribute. Xdas comes with several flavors of {py:class}`~xdas.Coordinate` objects. | Type | Description | `name` | `data` | |:---|:---|:---:|:---:| @@ -17,7 +17,7 @@ kernelspec: | {py:class}`~xdas.coordinates.InterpCoordinate` | Values are interpolated linearly between tie points | `interpolated` | `{"tie_indices": array-like[int], "tie_values": array-like[any]}` | | {py:class}`~xdas.coordinates.SampledCoordinate` | Values are given as a multiple of a fixed sampling interval and several start values | `sampled` | `{"tie_values": array-like[any], "tie_indices": array-like[int], "sampling_interval": any}` | -In the current state fo the documentation most of the coordinate information can be found in the [Interpolated Coordinate](interpolated-coordinates) page. +In the current state of the documentation, most coordinate information can be found on the [Interpolated Coordinates](interpolated-coordinates) page. 
## Per type information diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index df24191a..531c5cc6 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -27,7 +27,7 @@ def __init__(self, data=None, dim=None, dtype=None): @property def empty(self): - return bool(self.data["size"]) + return self.data["size"] == 0 @property def dtype(self): diff --git a/xdas/fft.py b/xdas/fft.py index bb47bb89..99cdfbb5 100644 --- a/xdas/fft.py +++ b/xdas/fft.py @@ -38,7 +38,7 @@ def fft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional fourrier operations, repeat this function on the + To perform a multidimensional Fourier operations, repeat this function on the desired dimensions. Examples @@ -106,7 +106,7 @@ def rfft(da, n=None, dim={"last": "spectrum"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional fourrier operations, repeat this function on the + To perform a multidimensional Fourier operations, repeat this function on the desired dimensions. Examples @@ -156,7 +156,7 @@ def ifft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): dimension specified by `dim` is used. dim: {str: str}, optional A mapping indicating as a key the dimension along which to compute the IFFT, and - as value the new name of the dimension. Default to {"last": "time"}. + as value the new name of the dimension. Default to {"last": "signal"}. norm: {“backward”, “ortho”, “forward”}, optional Normalization mode (see `numpy.fft`). Default is "backward". Indicates which direction of the forward/backward pair of transforms is scaled and with what @@ -169,7 +169,7 @@ def ifft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional inverse fourrier operations, repeat this function on + To perform a multidimensional inverse Fourier operations, repeat this function on the desired dimensions. 
Examples @@ -237,7 +237,7 @@ def irfft(da, n=None, dim={"last": "signal"}, norm=None, parallel=None): Notes ----- - To perform a multidimensional fourrier operations, repeat this function on the + To perform a multidimensional Fourier operations, repeat this function on the desired dimensions. Examples diff --git a/xdas/picking.py b/xdas/picking.py index 76c329f6..bea3275e 100644 --- a/xdas/picking.py +++ b/xdas/picking.py @@ -103,8 +103,10 @@ def tapered_selection(da, start, end, window=None, size=None, dim="last"): } else: pass # skip non-dimensional coords for `dim` - else: + elif da[name].dim is not None: coords[name] = da[name][selection] + else: + coords[name] = da[name] # return output DataArray return xd.DataArray(data, coords=coords, dims=da.dims) From 8c8aa8a3e56e136fecdcd0561081411be618a127 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 12:08:45 +0100 Subject: [PATCH 102/104] Remove useless code. --- xdas/coordinates/default.py | 2 +- xdas/io/febus.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/xdas/coordinates/default.py b/xdas/coordinates/default.py index 531c5cc6..ab19b7c8 100644 --- a/xdas/coordinates/default.py +++ b/xdas/coordinates/default.py @@ -27,7 +27,7 @@ def __init__(self, data=None, dim=None, dtype=None): @property def empty(self): - return self.data["size"] == 0 + return self.data["size"] == 0 @property def dtype(self): diff --git a/xdas/io/febus.py b/xdas/io/febus.py index e1c2f84d..c57d1da0 100644 --- a/xdas/io/febus.py +++ b/xdas/io/febus.py @@ -102,7 +102,3 @@ def read(fname, overlaps=None, offset=None, ctype=None): dc.append(da) return concatenate(dc, "time") - - -def _to_datetime64(data): - return From 1ebd1f26ef36a960f405289b5d7f9ebebd3e8c20 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 12:12:56 +0100 Subject: [PATCH 103/104] Fix doc link. 
--- docs/getting-started.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/getting-started.md b/docs/getting-started.md index f717052a..83f398a4 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -63,7 +63,7 @@ da = xdas.open_mfdataarray("00*.h5", engine=None) da ``` -Xdas only loads the metadata from each file and returns a {py:class}`~xdas.DataArray` object. This object has mainly two attributes. First a `data` attribute that contain the data. Here a {py:class}`~xdas.VirtualStack` object that is a pointer to the different files we opened. Second, a `coords` attribute that contains the metadata related to how the space and the time are sampled. Here both dimensions are labeled using {py:class}`~xdas.InterpCoordinate` objects. Those allow to concisely store the time and space information, including potential gaps and overlaps. See the [](user-guide/interpolated-coordinates) section for more information. +Xdas only loads the metadata from each file and returns a {py:class}`~xdas.DataArray` object. This object has mainly two attributes. First a `data` attribute that contain the data. Here a {py:class}`~xdas.VirtualStack` object that is a pointer to the different files we opened. Second, a `coords` attribute that contains the metadata related to how the space and the time are sampled. Here both dimensions are labeled using {py:class}`~xdas.InterpCoordinate` objects. Those allow to concisely store the time and space information, including potential gaps and overlaps. See the [](user-guide/coordinates/interpolated-coordinates) section for more information. Note that if you want to create a single data collection object for multiple acquisitions (i.e. different instruments or several acquisition with different parameters), you can use the [DataCollection](user-guide/data-structures/datacollection) structure. 
From 7dc0529222a23791534ea8374e073cf5124c374a Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 13:43:44 +0100 Subject: [PATCH 104/104] Perfect release note. --- docs/release-notes.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/release-notes.md b/docs/release-notes.md index 3e85c262..dfbad0ee 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -1,11 +1,12 @@ # Release notes ## 0.2.5 -- Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). +- Add SampleCoordinate for more SEED-like coordinates and refactor the coordinate backend (@atrabattoni). +- Add `xdas.picking.tapered_selection` to extract windows around picks (@atrabattoni). - Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). - Add support for multiple ROI for ASN engine (@martijnende). - `tolerance` can now be passed as seconds for datetime64 coordinates (@martijnende, @atrabattoni) -- Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). +- Add suppport for python 3.14, numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). ## 0.2.4 - Add StreamWriter to write long time series to miniSEED (@marbail).