diff --git a/docs/release-notes.md b/docs/release-notes.md index e9f92696..3e85c262 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -4,6 +4,7 @@ - Add SampleCoordinate for more SEED-like coordinates (@atrabattoni). - Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco). - Add support for multiple ROI for ASN engine (@martijnende). +- `tolerance` can now be passed as seconds for datetime64 coordinates (@martijnende, @atrabattoni) - Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto). ## 0.2.4 diff --git a/tests/coordinates/test_interp.py b/tests/coordinates/test_interp.py index f9ef27a1..bdd65e97 100644 --- a/tests/coordinates/test_interp.py +++ b/tests/coordinates/test_interp.py @@ -285,6 +285,21 @@ def test_simplify(self): coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) assert len(coord.simplify(1.0).tie_indices) == 2 + def test_simplify_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + xp = np.sort(np.random.choice(10000, 1000, replace=False)) + xp[0] = 0 + xp[-1] = 10000 + yp = ( + t0 + + xp.astype("timedelta64[s]") + + np.random.randint(-500, 500, size=1000).astype("timedelta64[ms]") + ) + coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp}) + assert len(coord.simplify(np.timedelta64(1, "s")).tie_indices) == 2 + assert len(coord.simplify(np.timedelta64(1000, "ms")).tie_indices) == 2 + assert len(coord.simplify(1.0).tie_indices) == 2 + def test_singleton(self): coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]}) assert coord[0].values == 1.0 diff --git a/tests/coordinates/test_sampled.py b/tests/coordinates/test_sampled.py index 82d083d5..9ef0467c 100644 --- a/tests/coordinates/test_sampled.py +++ b/tests/coordinates/test_sampled.py @@ -678,6 +678,23 @@ def test_simplify_with_tolerance(self): result = coord.simplify(tolerance=0.1) assert np.all(np.abs(result.values - coord.values) <= 0.1) + def test_simplify_with_tolerance_on_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + jitter = np.random.rand(100) * 0.2 - 0.1 + jitter = jitter.astype("timedelta64[ms]") # convert to timedelta + coord = SampledCoordinate( + { + "tie_values": t0 + 10 * np.arange(100) + jitter, + "tie_lengths": 10 * np.ones(100, dtype=int), + "sampling_interval": np.timedelta64(1, "s"), + } + ) + result = coord.simplify(tolerance=np.timedelta64(200, "ms")) + assert len(result.tie_values) == 1 + # float tolerance should be treated as seconds + result = coord.simplify(tolerance=0.2) + assert len(result.tie_values) == 1 + class TestSampledCoordinateGetIndexer: def make_coord(self): @@ -857,7 +874,7 @@ def test_to_netcdf_and_back(self): class TestGetSplitIndices: - def test_get_split_indices_no_tolerance(self): + def test_no_tolerance(self): coord = SampledCoordinate( {"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0} ) @@ -865,7 +882,7 @@ def test_get_split_indices_no_tolerance(self): expected = np.array([3]) # indices where segments end assert np.array_equal(div_points, expected) - def test_get_split_indices_with_tolerance(self): + def test_with_tolerance(self): coord = SampledCoordinate( { "tie_values": [0.0, 3.1, 10.0], @@ -877,6 +894,26 @@ def test_get_split_indices_with_tolerance(self): expected = np.array([5]) # only the second gap exceeds tolerance assert np.array_equal(div_points, expected) + def test_with_tolerance_on_datetime(self): + t0 = np.datetime64("2000-01-01T00:00:00") + coord = SampledCoordinate( + { + "tie_values": [ 
+ t0, + t0 + np.timedelta64(3, "s") + np.timedelta64(100, "ms"), + t0 + np.timedelta64(10, "s"), + ], + "tie_lengths": [3, 2, 2], + "sampling_interval": np.timedelta64(1, "s"), + } + ) + div_points = coord.get_split_indices(tolerance=np.timedelta64(200, "ms")) + expected = np.array([5]) # only the second gap exceeds tolerance + assert np.array_equal(div_points, expected) + # float tolerance should be treated as seconds + div_points = coord.get_split_indices(tolerance=0.2) + assert np.array_equal(div_points, expected) + class TestFromBlock: def test_from_block(self): diff --git a/tests/test_core.py b/tests/test_core.py index 51c33b2f..ac4237fe 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -211,23 +211,6 @@ def test_asdataarray(self): for dim in da.dims: assert np.array_equal(out[dim].values, da[dim].values) - def test_split(self): - da = xd.DataArray( - np.ones(30), - { - "time": { - "tie_indices": [0, 9, 10, 19, 20, 29], - "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0], - }, - }, - ) - assert xd.concatenate(xd.split(da)).equals(da) - assert xd.split(da, tolerance=20.0)[0].equals(da) - - def test_chunk(self): - da = wavelet_wavefronts() - assert xd.concatenate(xd.split(da, 3)).equals(da) - def test_align(self): da1 = xd.DataArray(np.arange(2), {"x": [0, 1]}) da2 = xd.DataArray(np.arange(3), {"y": [2, 3, 4]}) @@ -240,3 +223,43 @@ def test_align(self): da3 = xd.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]}) with pytest.raises(ValueError, match="differs from one data array to another"): xd.align(da1, da2, da3) + + +class TestSplit: + def test_integer(self): + da = wavelet_wavefronts() + assert xd.concatenate(xd.split(da, 3)).equals(da) + + def test_interp(self): + da = xd.DataArray( + np.ones(30), + { + "time": { + "tie_indices": [0, 9, 10, 19, 20, 29], + "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0], + }, + }, + ) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) + + def test_interp_datetime(self): + da = xd.DataArray( + np.ones(30), + { + "time": { + "tie_indices": [0, 9, 10, 19, 20, 29], + "tie_values": [ + np.datetime64("2000-01-01T00:00:00"), + np.datetime64("2000-01-01T00:00:09"), + np.datetime64("2000-01-01T00:00:20"), + np.datetime64("2000-01-01T00:00:29"), + np.datetime64("2000-01-01T00:00:40"), + np.datetime64("2000-01-01T00:00:49"), + ], + }, + }, + ) + assert xd.concatenate(xd.split(da)).equals(da) + assert xd.split(da, tolerance=np.timedelta64(20, "s"))[0].equals(da) + assert xd.split(da, tolerance=20.0)[0].equals(da) diff --git a/xdas/coordinates/core.py b/xdas/coordinates/core.py index 4154467d..1f970bb6 100644 --- a/xdas/coordinates/core.py +++ b/xdas/coordinates/core.py @@ -590,6 +590,18 @@ def parse(data, dim=None): return data, dim +def parse_tolerance(tolerance, dtype): + if np.issubdtype(dtype, np.datetime64): + if tolerance is None: + tolerance = np.timedelta64(0, "ns") + elif isinstance(tolerance, (int, float)): + tolerance = np.timedelta64(round(tolerance * 1e9), "ns") + else: + if tolerance is None: + tolerance = 0 + return tolerance + + def get_sampling_interval(da, dim, cast=True): """ Returns the sample spacing along a given dimension. 
diff --git a/xdas/coordinates/interp.py b/xdas/coordinates/interp.py index 741e4b43..3361d7c1 100644 --- a/xdas/coordinates/interp.py +++ b/xdas/coordinates/interp.py @@ -4,7 +4,13 @@ import pandas as pd from xinterp import forward, inverse -from .core import Coordinate, format_datetime, is_strictly_increasing, parse +from .core import ( + Coordinate, + format_datetime, + is_strictly_increasing, + parse, + parse_tolerance, +) class InterpCoordinate(Coordinate, name="interpolated"): @@ -286,11 +292,7 @@ def decimate(self, q): ) def simplify(self, tolerance=None): - if tolerance is None: - if np.issubdtype(self.dtype, np.datetime64): - tolerance = np.timedelta64(0, "ns") - else: - tolerance = 0.0 + tolerance = parse_tolerance(tolerance, self.dtype) tie_indices, tie_values = douglas_peucker( self.tie_indices, self.tie_values, tolerance ) @@ -302,6 +304,7 @@ def get_split_indices(self, tolerance=None): (indices,) = np.nonzero(np.diff(self.tie_indices) == 1) indices += 1 if tolerance is not None: + tolerance = parse_tolerance(tolerance, self.dtype) deltas = self.tie_values[indices + 1] - self.tie_values[indices] indices = indices[np.abs(deltas) >= tolerance] return np.array( diff --git a/xdas/coordinates/sampled.py b/xdas/coordinates/sampled.py index 34485eaf..cf40511b 100644 --- a/xdas/coordinates/sampled.py +++ b/xdas/coordinates/sampled.py @@ -2,7 +2,13 @@ import numpy as np -from .core import Coordinate, format_datetime, is_strictly_increasing, parse +from .core import ( + Coordinate, + format_datetime, + is_strictly_increasing, + parse, + parse_tolerance, +) CODE_TO_UNITS = { "h": "hours", @@ -384,8 +390,7 @@ def decimate(self, q): return self[::q] def simplify(self, tolerance=None): - if tolerance is None: - tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()] + tolerance = parse_tolerance(tolerance, self.dtype) tie_values = [self.tie_values[0]] tie_lengths = [self.tie_lengths[0]] for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]): @@ -407,6 +412,7 @@ def simplify(self, tolerance=None): def get_split_indices(self, tolerance=None): indices = self.tie_indices[1:] if tolerance is not None: + tolerance = parse_tolerance(tolerance, self.dtype) deltas = self.tie_values[1:] - ( self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1] ) diff --git a/xdas/core/routines.py b/xdas/core/routines.py index c3a75dcb..587c955e 100644 --- a/xdas/core/routines.py +++ b/xdas/core/routines.py @@ -40,7 +40,8 @@ def open_mfdatacollection( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -118,7 +119,8 @@ def open_mfdatatree( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. 
squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -217,7 +219,8 @@ def collect( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -284,7 +287,8 @@ def open_mfdataarray( The dimension along which the data arrays are concatenated. Default to "first". tolerance : float of timedelta64, optional During concatenation, the tolerance to consider that the end of a file is - continuous with beginning of the following one. Default to zero tolerance. + continuous with beginning of the following one. For time coordinates, numeric + values are considered as seconds. Default to zero tolerance. squeeze : bool, optional Whether to return a DataArray instead of a DataCollection if the combination results in a data collection containing a unique data array. @@ -430,8 +434,9 @@ def asdataarray(obj, tolerance=None): obj : object The objected to convert tolerance : float or datetime64, optional - For dense coordinates, tolerance error for interpolation representation, by - default zero. + For dense coordinates, tolerance error for interpolation representation. + For time coordinates, numeric values are considered as seconds. + Zero by default. Returns ------- @@ -472,7 +477,8 @@ def combine_by_field( The dimension along which concatenate. Default to "first". tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. squeeze : bool, optional Whether to return a Database instead of a DataCollection if the combinatison results in a data collection containing a unique Database. @@ -535,7 +541,8 @@ def combine_by_coords( The dimension along which concatenate. Default to "first". tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. squeeze : bool, optional Whether to return a Database instead of a DataCollection if the combination results in a data collection containing a unique Database. @@ -673,7 +680,8 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None): The dimension along which concatenate. tolerance : float of timedelta64, optional The tolerance to consider that the end of a file is continuous with beginning of - the following, zero by default. + the following. For time coordinates, numeric values are considered as seconds. + Zero by default. virtual : bool, optional Whether to create a virtual dataset. It requires that all concatenated data arrays are virtual. By default tries to create a virtual dataset if possible. 
@@ -769,7 +777,8 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None The dimension along which to split, by default "first" tolerance : float or timedelta64, optional If `indices_or_sections="discontinuities"` split will only occur on gaps and - overlaps that are bigger than `tolerance`. Zero tolerance by default. + overlaps that are bigger than `tolerance`. For time coordinates, numeric + values are considered as seconds. Zero tolerance by default. Returns -------
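Reviewer note — the end-user behaviour exercised by `TestSplit.test_interp_datetime` above, as a short usage sketch (illustrative only):

    import numpy as np
    import xdas as xd

    da = xd.DataArray(
        np.ones(30),
        {
            "time": {
                "tie_indices": [0, 9, 10, 19, 20, 29],
                "tie_values": [
                    np.datetime64("2000-01-01T00:00:00"),
                    np.datetime64("2000-01-01T00:00:09"),
                    np.datetime64("2000-01-01T00:00:20"),
                    np.datetime64("2000-01-01T00:00:29"),
                    np.datetime64("2000-01-01T00:00:40"),
                    np.datetime64("2000-01-01T00:00:49"),
                ],
            },
        },
    )

    # These two calls are now equivalent: a plain number is read as seconds.
    xd.split(da, tolerance=np.timedelta64(20, "s"))
    xd.split(da, tolerance=20.0)

The same convention applies to the `tolerance` argument of `concatenate`, `open_mfdataarray`, and the other routines whose docstrings are updated above.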