Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/release-notes.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
- Add SampleCoordinate for more SEED-like coordinates (@atrabattoni).
- Add `create_dirs` to `.to_netcdf` methods to create intermediate directories (@aurelienfalco).
- Add support for multiple ROI for ASN engine (@martijnende).
- `tolerance` can now be passed as seconds for datetime64 coordinates (@martijnende, @atrabattoni).
- Fix numpy 2.4 and obspy 1.4.2 incompatibilities and add `xdas.__version__` (@atrabatto).

## 0.2.4
Expand Down
15 changes: 15 additions & 0 deletions tests/coordinates/test_interp.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,21 @@ def test_simplify(self):
coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp})
assert len(coord.simplify(1.0).tie_indices) == 2

def test_simplify_datetime(self):
t0 = np.datetime64("2000-01-01T00:00:00")
xp = np.sort(np.random.choice(10000, 1000, replace=False))
xp[0] = 0
xp[-1] = 10000
yp = (
t0
+ xp.astype("timedelta64[s]")
+ np.random.randint(-500, 500, size=1000).astype("timedelta64[ms]")
)
coord = InterpCoordinate({"tie_indices": xp, "tie_values": yp})
assert len(coord.simplify(np.timedelta64(1, "s")).tie_indices) == 2
assert len(coord.simplify(np.timedelta64(1000, "ms")).tie_indices) == 2
assert len(coord.simplify(1.0).tie_indices) == 2

def test_singleton(self):
coord = InterpCoordinate({"tie_indices": [0], "tie_values": [1.0]})
assert coord[0].values == 1.0
Expand Down
41 changes: 39 additions & 2 deletions tests/coordinates/test_sampled.py
Original file line number Diff line number Diff line change
Expand Up @@ -678,6 +678,23 @@ def test_simplify_with_tolerance(self):
result = coord.simplify(tolerance=0.1)
assert np.all(np.abs(result.values - coord.values) <= 0.1)

def test_simplify_with_tolerance_on_datetime(self):
t0 = np.datetime64("2000-01-01T00:00:00")
jitter = np.random.rand(100) * 0.2 - 0.1
jitter = jitter.astype("timedelta64[ms]") # convert to timedelta
coord = SampledCoordinate(
{
"tie_values": t0 + 10 * np.arange(100) + jitter,
"tie_lengths": 10 * np.ones(100, dtype=int),
"sampling_interval": np.timedelta64(1, "s"),
}
)
result = coord.simplify(tolerance=np.timedelta64(200, "ms"))
assert len(result.tie_values) == 1
# float tolerance should be treated as seconds
result = coord.simplify(tolerance=0.2)
assert len(result.tie_values) == 1


class TestSampledCoordinateGetIndexer:
def make_coord(self):
Expand Down Expand Up @@ -857,15 +874,15 @@ def test_to_netcdf_and_back(self):


class TestGetSplitIndices:
def test_get_split_indices_no_tolerance(self):
def test_no_tolerance(self):
coord = SampledCoordinate(
{"tie_values": [0.0, 10.0], "tie_lengths": [3, 2], "sampling_interval": 1.0}
)
div_points = coord.get_split_indices()
expected = np.array([3]) # indices where segments end
assert np.array_equal(div_points, expected)

def test_get_split_indices_with_tolerance(self):
def test_with_tolerance(self):
coord = SampledCoordinate(
{
"tie_values": [0.0, 3.1, 10.0],
Expand All @@ -877,6 +894,26 @@ def test_get_split_indices_with_tolerance(self):
expected = np.array([5]) # only the second gap exceeds tolerance
assert np.array_equal(div_points, expected)

def test_with_tolerance_on_datetime(self):
t0 = np.datetime64("2000-01-01T00:00:00")
coord = SampledCoordinate(
{
"tie_values": [
t0,
t0 + np.timedelta64(3, "s") + np.timedelta64(100, "ms"),
t0 + np.timedelta64(10, "s"),
],
"tie_lengths": [3, 2, 2],
"sampling_interval": np.timedelta64(1, "s"),
}
)
div_points = coord.get_split_indices(tolerance=np.timedelta64(200, "ms"))
expected = np.array([5]) # only the second gap exceeds tolerance
assert np.array_equal(div_points, expected)
# float tolerance should be treated as seconds
div_points = coord.get_split_indices(tolerance=0.2)
assert np.array_equal(div_points, expected)


class TestFromBlock:
def test_from_block(self):
Expand Down
57 changes: 40 additions & 17 deletions tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,23 +211,6 @@ def test_asdataarray(self):
for dim in da.dims:
assert np.array_equal(out[dim].values, da[dim].values)

def test_split(self):
da = xd.DataArray(
np.ones(30),
{
"time": {
"tie_indices": [0, 9, 10, 19, 20, 29],
"tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0],
},
},
)
assert xd.concatenate(xd.split(da)).equals(da)
assert xd.split(da, tolerance=20.0)[0].equals(da)

def test_chunk(self):
da = wavelet_wavefronts()
assert xd.concatenate(xd.split(da, 3)).equals(da)

def test_align(self):
da1 = xd.DataArray(np.arange(2), {"x": [0, 1]})
da2 = xd.DataArray(np.arange(3), {"y": [2, 3, 4]})
Expand All @@ -240,3 +223,43 @@ def test_align(self):
da3 = xd.DataArray(np.arange(6).reshape(2, 3), {"x": [1, 2], "y": [2, 3, 4]})
with pytest.raises(ValueError, match="differs from one data array to another"):
xd.align(da1, da2, da3)


class TestSplit:
    def test_integer(self):
        """Splitting into n chunks then concatenating round-trips the array."""
        data_array = wavelet_wavefronts()
        chunks = xd.split(data_array, 3)
        assert xd.concatenate(chunks).equals(data_array)

    def test_interp(self):
        """Splitting on discontinuities of an interpolated float coordinate."""
        data_array = xd.DataArray(
            np.ones(30),
            {
                "time": {
                    "tie_indices": [0, 9, 10, 19, 20, 29],
                    "tie_values": [0.0, 9.0, 20.0, 29.0, 40.0, 49.0],
                },
            },
        )
        chunks = xd.split(data_array)
        assert xd.concatenate(chunks).equals(data_array)
        # A tolerance larger than every gap: nothing gets split.
        assert xd.split(data_array, tolerance=20.0)[0].equals(data_array)

    def test_interp_datetime(self):
        """Same as test_interp with datetime ties; float tolerances count as seconds."""
        data_array = xd.DataArray(
            np.ones(30),
            {
                "time": {
                    "tie_indices": [0, 9, 10, 19, 20, 29],
                    "tie_values": [
                        np.datetime64("2000-01-01T00:00:00"),
                        np.datetime64("2000-01-01T00:00:09"),
                        np.datetime64("2000-01-01T00:00:20"),
                        np.datetime64("2000-01-01T00:00:29"),
                        np.datetime64("2000-01-01T00:00:40"),
                        np.datetime64("2000-01-01T00:00:49"),
                    ],
                },
            },
        )
        assert xd.concatenate(xd.split(data_array)).equals(data_array)
        assert xd.split(data_array, tolerance=np.timedelta64(20, "s"))[0].equals(data_array)
        assert xd.split(data_array, tolerance=20.0)[0].equals(data_array)
12 changes: 12 additions & 0 deletions xdas/coordinates/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,6 +590,18 @@ def parse(data, dim=None):
return data, dim


def parse_tolerance(tolerance, dtype):
if np.issubdtype(dtype, np.datetime64):
if tolerance is None:
tolerance = np.timedelta64(0, "ns")
elif isinstance(tolerance, (int, float)):
tolerance = np.timedelta64(round(tolerance * 1e9), "ns")
else:
if tolerance is None:
tolerance = 0
return tolerance


def get_sampling_interval(da, dim, cast=True):
"""
Returns the sample spacing along a given dimension.
Expand Down
15 changes: 9 additions & 6 deletions xdas/coordinates/interp.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
import pandas as pd
from xinterp import forward, inverse

from .core import Coordinate, format_datetime, is_strictly_increasing, parse
from .core import (
Coordinate,
format_datetime,
is_strictly_increasing,
parse,
parse_tolerance,
)


class InterpCoordinate(Coordinate, name="interpolated"):
Expand Down Expand Up @@ -286,11 +292,7 @@ def decimate(self, q):
)

def simplify(self, tolerance=None):
if tolerance is None:
if np.issubdtype(self.dtype, np.datetime64):
tolerance = np.timedelta64(0, "ns")
else:
tolerance = 0.0
tolerance = parse_tolerance(tolerance, self.dtype)
tie_indices, tie_values = douglas_peucker(
self.tie_indices, self.tie_values, tolerance
)
Expand All @@ -302,6 +304,7 @@ def get_split_indices(self, tolerance=None):
(indices,) = np.nonzero(np.diff(self.tie_indices) == 1)
indices += 1
if tolerance is not None:
tolerance = parse_tolerance(tolerance, self.dtype)
deltas = self.tie_values[indices + 1] - self.tie_values[indices]
indices = indices[np.abs(deltas) >= tolerance]
return np.array(
Expand Down
12 changes: 9 additions & 3 deletions xdas/coordinates/sampled.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@

import numpy as np

from .core import Coordinate, format_datetime, is_strictly_increasing, parse
from .core import (
Coordinate,
format_datetime,
is_strictly_increasing,
parse,
parse_tolerance,
)

CODE_TO_UNITS = {
"h": "hours",
Expand Down Expand Up @@ -384,8 +390,7 @@ def decimate(self, q):
return self[::q]

def simplify(self, tolerance=None):
if tolerance is None:
tolerance = np.array(0, dtype=self.sampling_interval.dtype)[()]
tolerance = parse_tolerance(tolerance, self.dtype)
tie_values = [self.tie_values[0]]
tie_lengths = [self.tie_lengths[0]]
for value, length in zip(self.tie_values[1:], self.tie_lengths[1:]):
Expand All @@ -407,6 +412,7 @@ def simplify(self, tolerance=None):
def get_split_indices(self, tolerance=None):
indices = self.tie_indices[1:]
if tolerance is not None:
tolerance = parse_tolerance(tolerance, self.dtype)
deltas = self.tie_values[1:] - (
self.tie_values[:-1] + self.sampling_interval * self.tie_lengths[:-1]
)
Expand Down
29 changes: 19 additions & 10 deletions xdas/core/routines.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ def open_mfdatacollection(
The dimension along which the data arrays are concatenated. Default to "first".
tolerance : float or timedelta64, optional
During concatenation, the tolerance to consider that the end of a file is
continuous with beginning of the following one. Default to zero tolerance.
continuous with beginning of the following one. For time coordinates, numeric
values are considered as seconds. Default to zero tolerance.
squeeze : bool, optional
Whether to return a DataArray instead of a DataCollection if the combination
results in a data collection containing a unique data array.
Expand Down Expand Up @@ -118,7 +119,8 @@ def open_mfdatatree(
The dimension along which the data arrays are concatenated. Default to "first".
tolerance : float or timedelta64, optional
During concatenation, the tolerance to consider that the end of a file is
continuous with beginning of the following one. Default to zero tolerance.
continuous with beginning of the following one. For time coordinates, numeric
values are considered as seconds. Default to zero tolerance.
squeeze : bool, optional
Whether to return a DataArray instead of a DataCollection if the combination
results in a data collection containing a unique data array.
Expand Down Expand Up @@ -217,7 +219,8 @@ def collect(
The dimension along which the data arrays are concatenated. Default to "first".
tolerance : float or timedelta64, optional
During concatenation, the tolerance to consider that the end of a file is
continuous with beginning of the following one. Default to zero tolerance.
continuous with beginning of the following one. For time coordinates, numeric
values are considered as seconds. Default to zero tolerance.
squeeze : bool, optional
Whether to return a DataArray instead of a DataCollection if the combination
results in a data collection containing a unique data array.
Expand Down Expand Up @@ -284,7 +287,8 @@ def open_mfdataarray(
The dimension along which the data arrays are concatenated. Default to "first".
tolerance : float or timedelta64, optional
During concatenation, the tolerance to consider that the end of a file is
continuous with beginning of the following one. Default to zero tolerance.
continuous with beginning of the following one. For time coordinates, numeric
values are considered as seconds. Default to zero tolerance.
squeeze : bool, optional
Whether to return a DataArray instead of a DataCollection if the combination
results in a data collection containing a unique data array.
Expand Down Expand Up @@ -430,8 +434,9 @@ def asdataarray(obj, tolerance=None):
obj : object
The objected to convert
tolerance : float or datetime64, optional
For dense coordinates, tolerance error for interpolation representation, by
default zero.
For dense coordinates, tolerance error for interpolation representation.
For time coordinates, numeric values are considered as seconds.
Zero by default.

Returns
-------
Expand Down Expand Up @@ -472,7 +477,8 @@ def combine_by_field(
The dimension along which concatenate. Default to "first".
tolerance : float or timedelta64, optional
The tolerance to consider that the end of a file is continuous with beginning of
the following, zero by default.
the following. For time coordinates, numeric values are considered as seconds.
Zero by default.
squeeze : bool, optional
Whether to return a Database instead of a DataCollection if the combination
results in a data collection containing a unique Database.
Expand Down Expand Up @@ -535,7 +541,8 @@ def combine_by_coords(
The dimension along which concatenate. Default to "first".
tolerance : float or timedelta64, optional
The tolerance to consider that the end of a file is continuous with beginning of
the following, zero by default.
the following. For time coordinates, numeric values are considered as seconds.
Zero by default.
squeeze : bool, optional
Whether to return a Database instead of a DataCollection if the combination
results in a data collection containing a unique Database.
Expand Down Expand Up @@ -673,7 +680,8 @@ def concatenate(objs, dim="first", tolerance=None, virtual=None, verbose=None):
The dimension along which concatenate.
tolerance : float or timedelta64, optional
The tolerance to consider that the end of a file is continuous with beginning of
the following, zero by default.
the following. For time coordinates, numeric values are considered as seconds.
Zero by default.
virtual : bool, optional
Whether to create a virtual dataset. It requires that all concatenated
data arrays are virtual. By default tries to create a virtual dataset if possible.
Expand Down Expand Up @@ -769,7 +777,8 @@ def split(da, indices_or_sections="discontinuities", dim="first", tolerance=None
The dimension along which to split, by default "first"
tolerance : float or timedelta64, optional
If `indices_or_sections="discontinuities"` split will only occur on gaps and
overlaps that are bigger than `tolerance`. Zero tolerance by default.
overlaps that are bigger than `tolerance`. For time coordinates, numeric
values are considered as seconds. Zero tolerance by default.

Returns
-------
Expand Down