From d327bcc93b90a5f8a48e5a8709d7ced64f6138e6 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Thu, 5 Mar 2026 18:58:40 +0100 Subject: [PATCH 1/7] WIP. --- xdas/coordinates/core.py | 12 ++++++++++++ xdas/coordinates/interp.py | 14 ++++++++------ xdas/coordinates/sampled.py | 12 +++++++++--- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 4154467d..2e81bb86 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -590,6 +590,18 @@ def parse(data, dim=None): return data, dim +def parse_tolerance(tolerance, dtype): + if np.issubdtype(dtype, np.datetime64): + if tolerance is None: + tolerance = np.timedelta64(0, "ns") + elif isinstance(tolerance, (int, float)): + tolerance = np.timedelta64(round(tolerance * 1e9), "ns") + else: + if tolerance is None: + tolerance = 0.0 + return tolerance + + def get_sampling_interval(da, dim, cast=True): """ Returns the sample spacing along a given dimension. diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 741e4b43..46b8e31d 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -4,7 +4,13 @@ import pandas as pd from xinterp import forward, inverse -from .core import Coordinate, format_datetime, is_strictly_increasing, parse +from .core import ( + Coordinate, + format_datetime, + is_strictly_increasing, + parse, + parse_tolerance, +) class InterpCoordinate(Coordinate, name="interpolated"): @@ -286,11 +292,7 @@ def decimate(self, q): ) def simplify(self, tolerance=None): - if tolerance is None: - if np.issubdtype(self.dtype, np.datetime64): - tolerance = np.timedelta64(0, "ns") - else: - tolerance = 0.0 + tolerance = parse_tolerance(tolerance, self.dtype) tie_indices, tie_values = douglas_peucker( self.tie_indices, self.tie_values, tolerance ) diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 34485eaf..a1b47e3a 100644 --- a/xdas/coordinates/sampled.py +++ 
b/xdas/coordinates/sampled.py @@ -2,7 +2,13 @@ import numpy as np -from .core import Coordinate, format_datetime, is_strictly_increasing, parse +from .core import ( + Coordinate, + format_datetime, + is_strictly_increasing, + parse, + parse_tolerance, +) CODE_TO_UNITS = { "h": "hours", @@ -384,8 +390,8 @@ def decimate(self, q): return self[::q] def simplify(self, tolerance=None): - if tolerance is None: - tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] + tolerance = parse_tolerance(tolerance, self.dtype) + tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): From b07ab602abd9c6c6b19ecc63f2309285b71eb617 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 09:33:05 +0100 Subject: [PATCH 2/7] Improve things and fix tolerance bug for sampled. --- xdas/coordinates/core.py | 6 +++++- xdas/coordinates/sampled.py | 1 - 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 2e81bb86..f77bafe1 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -592,13 +592,17 @@ def parse(data, dim=None): def parse_tolerance(tolerance, dtype): if np.issubdtype(dtype, np.datetime64): + unit, step = np.datetime_data(dtype) + dtype = np.dtype(f"timedelta64[{step}{unit}]") if tolerance is None: tolerance = np.timedelta64(0, "ns") elif isinstance(tolerance, (int, float)): tolerance = np.timedelta64(round(tolerance * 1e9), "ns") else: if tolerance is None: - tolerance = 0.0 + tolerance = np.array(0, dtype=dtype)[()] + else: + tolerance = np.array(tolerance, dtype=dtype)[()] return tolerance diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index a1b47e3a..d1235d70 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -391,7 +391,6 @@ def decimate(self, q): def simplify(self, 
tolerance=None): tolerance = parse_tolerance(tolerance, self.dtype) - tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): From 49b8aad7e65e5a9cd0eaff91028c8d58ff8fe6cb Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 09:46:02 +0100 Subject: [PATCH 3/7] Test simplify methods. --- tests/coordinates/test_interp.py | 15 +++++++++++++++ tests/coordinates/test_sampled.py | 17 +++++++++++++++++ xdas/coordinates/core.py | 6 +----- 3 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py index f9ef27a1..bdd65e97 100644 --- a/tests/coordinates/test_interp.py +++ b/tests/coordinates/test_interp.py @@ -285,6 +285,21 @@ def test_simplify(self): coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) assert len(coord.simplify(1.0).tie_indices) == 2 + def test_simplify_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + xp = np.sort(np.random.choice(10000, 1000, replace=False)) + xp[0] = 0 + xp[-1] = 10000 + yp = ( + t0 + + xp.astype("timedelta64[s]") + + np.random.randint(-500, 500, size=1000).astype("timedelta64[ms]") + ) + coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) + assert len(coord.simplify(np.timedelta64(1, "s")).tie_indices) == 2 + assert len(coord.simplify(np.timedelta64(1000, "ms")).tie_indices) == 2 + assert len(coord.simplify(1.0).tie_indices) == 2 + def test_singleton(self): coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) assert coord[0].values == 1.0 diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 82d083d5..29e63ce6 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -678,6 +678,23 @@ def test_simplify_with_tolerance(self): result = coord.simplify(tolerance=0.1) assert 
np.all(np.abs(result.values - coord.values) <= 0.1) + def test_simplify_with_tolerance_on_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + jitter = np.random.rand(100) * 0.2 - 0.1 + jitter = jitter.astype("timedelta64[ms]") # convert to timedelta + coord = SampledCoordinate( + { + "tie_values": t0 + 10 * np.arange(100) + jitter, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": np.timedelta64(1, "s"), + } + ) + result = coord.simplify(tolerance=np.timedelta64(200, "ms")) + assert len(result.tie_values) == 1 + # float tolerance should be treated as seconds + result = coord.simplify(tolerance=0.2) + assert len(result.tie_values) == 1 + class TestSampledCoordinateGetIndexer: def make_coord(self): diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index f77bafe1..1f970bb6 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -592,17 +592,13 @@ def parse(data, dim=None): def parse_tolerance(tolerance, dtype): if np.issubdtype(dtype, np.datetime64): - unit, step = np.datetime_data(dtype) - dtype = np.dtype(f"timedelta64[{step}{unit}]") if tolerance is None: tolerance = np.timedelta64(0, "ns") elif isinstance(tolerance, (int, float)): tolerance = np.timedelta64(round(tolerance * 1e9), "ns") else: if tolerance is None: - tolerance = np.array(0, dtype=dtype)[()] - else: - tolerance = np.array(tolerance, dtype=dtype)[()] + tolerance = 0 return tolerance From d826375d806bf017baaad0590de255897a5007d5 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 10:03:57 +0100 Subject: [PATCH 4/7] Make split work with float tolerance and test it. 
--- tests/coordinates/test_sampled.py | 24 +++++++++++-- tests/test_core.py | 57 ++++++++++++++++++++++--------- xdas/coordinates/interp.py | 1 + xdas/coordinates/sampled.py | 1 + 4 files changed, 64 insertions(+), 19 deletions(-) diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 29e63ce6..9ef0467c 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -874,7 +874,7 @@ def test_to_netcdf_and_back(self): class TestGetSplitIndices: - def test_get_split_indices_no_tolerance(self): + def test_no_tolerance(self): coord = SampledCoordinate( {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} ) @@ -882,7 +882,7 @@ def test_get_split_indices_no_tolerance(self): expected = np.array([3]) # indices where segments end assert np.array_equal(div_points, expected) - def test_get_split_indices_with_tolerance(self): + def test_with_tolerance(self): coord = SampledCoordinate( { "tie_values": [0.0, 3.1, 10.0], @@ -894,6 +894,26 @@ def test_get_split_indices_with_tolerance(self): expected = np.array([5]) # only the second gap exceeds tolerance assert np.array_equal(div_points, expected) + def test_with_tolerance_on_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [ + t0, + t0 + np.timedelta64(3, "s") + np.timedelta64(100, "ms"), + t0 + np.timedelta64(10, "s"), + ], + "tie_lengths": [3, 2, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + div_points = coord.get_split_indices(tolerance=np.timedelta64(200, "ms")) + expected = np.array([5]) # only the second gap exceeds tolerance + assert np.array_equal(div_points, expected) + # float tolerance should be treated as seconds + div_points = coord.get_split_indices(tolerance=0.2) + assert np.array_equal(div_points, expected) + class TestFromBlock: def test_from_block(self): diff --git a/tests/test_core.py b/tests/test_core.py index 51c33b2f..ac4237fe 100644 --- 
a/tests/test_core.py +++ b/tests/test_core.py @@ -211,23 +211,6 @@ def test_asdataarray(self): for dim in da.dims: assert np.array_equal(out[dim].values, da[dim].values) - def test_split(self): - da = xd.DataArray( - np.ones(30), - { - "time": { - "tie_indices": [0, 9, 10, 19, 20, 29], - "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0], - }, - }, - ) - assert xd.concatenate(xd.split(da)).equals(da) - assert xd.split(da, tolerance=20.0)[0].equals(da) - - def test_chunk(self): - da = wavelet_wavefronts() - assert xd.concatenate(xd.split(da, 3)).equals(da) - def test_align(self): da1 = xd.DataArray(np.arange(2), {"x": [0, 1]}) da2 = xd.DataArray(np.arange(3), {"y": [2, 3, 4]}) @@ -240,3 +223,43 @@ def test_align(self): da3 = xd.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) with pytest.raises(ValueError, match="differs from one data array to another"): xd.align(da1, da2, da3) + + +class TestSplit: + def test_integer(self): + da = wavelet_wavefronts() + assert xd.concatenate(xd.split(da, 3)).equals(da) + + def test_interp(self): + da = xd.DataArray( + np.ones(30), + { + "time": { + "tie_indices": [0, 9, 10, 19, 20, 29], + "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0], + }, + }, + ) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) + + def test_interp_datetime(self): + da = xd.DataArray( + np.ones(30), + { + "time": { + "tie_indices": [0, 9, 10, 19, 20, 29], + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-01T00:00:09"), + np.datetime64("2000-01-01T00:00:20"), + np.datetime64("2000-01-01T00:00:29"), + np.datetime64("2000-01-01T00:00:40"), + np.datetime64("2000-01-01T00:00:49"), + ], + }, + }, + ) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=np.timedelta64(20, "s"))[0].equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 46b8e31d..3361d7c1 
100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -304,6 +304,7 @@ def get_split_indices(self, tolerance=None): (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) indices += 1 if tolerance is not None: + tolerance = parse_tolerance(tolerance, self.dtype) deltas = self.tie_values[indices + 1] - self.tie_values[indices] indices = indices[np.abs(deltas) >= tolerance] return np.array( diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index d1235d70..cf40511b 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -412,6 +412,7 @@ def simplify(self, tolerance=None): def get_split_indices(self, tolerance=None): indices = self.tie_indices[1:] if tolerance is not None: + tolerance = parse_tolerance(tolerance, self.dtype) deltas = self.tie_values[1:] - ( self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] ) From 363401ca6f7479cd0b40b9fd30054593192b2784 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 10:06:47 +0100 Subject: [PATCH 5/7] Add release note. --- docs/release-notes.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes.md b/docs/release-notes.md index e9f92696..3e85c262 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -4,6 +4,7 @@ - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). - Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). - Add support for multiple ROI for ASN engine (@martijnende). +- `tolerance` can now be passed as seconds for datetime64 coordinates (@martijnende, @atrabattoni). - Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). ## 0.2.4 From a3121dc9f390d2e9553dab00ae42b58ea5918bcb Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 11:28:03 +0100 Subject: [PATCH 6/7] update docstrings related to tolerance. 
--- xdas/core/routines.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index c3a75dcb..3784cfaa 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -40,7 +40,8 @@ def open_mfdatacollection( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -118,7 +119,8 @@ def open_mfdatatree( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -217,7 +219,8 @@ def collect( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. 
squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -284,7 +287,8 @@ def open_mfdataarray( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -769,7 +773,8 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None The dimension along which to split, by default "first" tolerance : float or timedelta64, optional If `indices_or_sections="discontinuities"` split will only occur on gaps and - overlaps that are bigger than `tolerance`. Zero tolerance by default. + overlaps that are bigger than `tolerance`. For time coordinates, numeric + values are considered as seconds. Zero tolerance by default. Returns ------- From 49936336e564ef032b496d438d203de253858da6 Mon Sep 17 00:00:00 2001 From: Alister Trabattoni Date: Fri, 6 Mar 2026 11:32:34 +0100 Subject: [PATCH 7/7] update missing tolerance docstring. --- xdas/core/routines.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index 3784cfaa..587c955e 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -434,8 +434,9 @@ def asdataarray(obj, tolerance=None): obj : object The objected to convert tolerance : float or datetime64, optional - For dense coordinates, tolerance error for interpolation representation, by - default zero. 
+ For dense coordinates, tolerance error for interpolation representation. + For time coordinates, numeric values are considered as seconds. + Zero by default. Returns ------- @@ -476,7 +477,8 @@ def combine_by_field( The dimension along which concatenate. Default to "first". tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. squeeze : bool, optional Whether to return a Database instead of a DataCollection if the combinatison results in a data collection containing a unique Database. @@ -539,7 +541,8 @@ def combine_by_coords( The dimension along which concatenate. Default to "first". tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. squeeze : bool, optional Whether to return a Database instead of a DataCollection if the combination results in a data collection containing a unique Database. @@ -677,7 +680,8 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None): The dimension along which concatenate. tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. virtual : bool, optional Whether to create a virtual dataset. It requires that all concatenated data arrays are virtual. By default tries to create a virtual dataset if possible.