3 changes: 2 additions & 1 deletion .gitignore
@@ -14,6 +14,7 @@ MANIFEST
*.log
*.fits.gz
stage
+.eggs
cheta.egg-info
sken.egg-info
Ska/engarchive/GIT_VERSION
@@ -22,7 +23,7 @@ dev_utils/event_filter_1.png
dev_utils/event_filter_2.png
dev_utils/l0_5icd.pdf
dev_utils/memleak.py
-*.ipynb
+play-*.ipynb
.ipynb_checkpoints
.idea
.vscode
101 changes: 97 additions & 4 deletions cheta/fetch.py
@@ -7,6 +7,7 @@
import collections
import contextlib
import fnmatch
+import itertools
import logging
import operator
import os
@@ -140,6 +141,17 @@
# Cached version (by content type) of first and last available times in archive
CONTENT_TIME_RANGES = {}

+# CXC content types that are ephemeris data and need special handling in fetch
+EPHEM_CONTENT_TYPES = (
+    "orbitephem0",
+    "lunarephem0",
+    "solarephem0",
+    "orbitephem1",
+    "lunarephem1",
+    "solarephem1",
+    "angleephem",
+)
+

def local_or_remote_function(remote_print_output):
"""
@@ -190,6 +202,78 @@ def wrapper(*args, **kwargs):
return the_decorator


+def _is_ephem_msid(msid: str) -> bool:
+    """True if this MSID is an ephemeris MSID.
+
+    Parameters
+    ----------
+    msid : str
+        MSID to check.
+
+    Returns
+    -------
+    bool
+        True if this MSID is an ephemeris MSID.
+    """
+    return content.get(msid.upper()) in EPHEM_CONTENT_TYPES
+
+
+def _get_ephem_times_ok(times: np.ndarray[float]) -> np.ndarray[bool]:
+    """Select most recent sample times for ephemeris data from overlapping chunks.
+
+    The raw ephemeris HDF5 archive files are non-monotonic because the archive FITS
+    files are overlapping and have duplicate times. Typically new ephemeris files come
+    out every 7 days and cover many weeks of data (e.g. about 10 weeks for the
+    predictive EPHEM 0 files).
+
+    The HDF5 files on HEAD have TIME.quality set to bad for the duplicate times, so
+    this code is not strictly required there. But for archives maintained with
+    cheta_sync, the quality codes are not back-updated when new data appear, causing
+    problems (https://github.com/sot/cheta/issues/245).
+
+    This function ignores the existing quality entirely and dynamically assembles a
+    mask selecting the best (latest) time samples to use.
+
+    Parameters
+    ----------
+    times : np.ndarray[float]
+        Array of time samples (seconds since 1998.0) from an ephemeris MSID.
+
+    Returns
+    -------
+    np.ndarray[bool]
+        Boolean mask array that is True for the best samples to keep.
+    """
+    # Baseline times to start at exactly zero and round to nearest 10. Ephemeris
+    # data are nominally sampled at exactly 300 sec intervals, but sometimes there
+    # are floating point variations and sometimes 301 (leap second?), and at least
+    # one larger gap of 43436 seconds.
+    times = np.round(times - times[0], -1)
+
+    # Times here come from complete and overlapping archive file chunks. This means
+    # that each chunk starts in the middle of the previous chunk and extends beyond.
+    # First get the index of the start of each chunk where time jumps backward.
+    dts = np.diff(times)
+    idx_neg_jumps = np.where(dts < 0)[0] + 1
+
+    # Get complete chunks as pairs of (idx_start, idx_stop).
+    idx_chunks = np.concatenate([[0], idx_neg_jumps, [len(times)]])
+    chunks = list(itertools.pairwise(idx_chunks))
+
+    # Make the mask by masking out every time in the previous chunk that is at or
+    # after the start of the current chunk. This mirrors archive update processing.
+    # These masks are always correct and up to date for the primary HDF5 archive on
+    # HEAD. However, cheta_sync archives do not have the quality flags back-updated,
+    # so this dynamic masking is needed to get correct results.
+    mask = np.ones(len(times), dtype=bool)
+    for (idx_start0, idx_stop0), (idx_start1, _) in itertools.pairwise(chunks):
+        times0 = times[idx_start0:idx_stop0]
+        i0 = np.searchsorted(times0, times[idx_start1])
+        mask[idx_start0 + i0 : idx_stop0] = False
+
+    return mask
+
+
def _split_path(path):
"""
Return a tuple of the components for ``path``. Strip off the drive if
@@ -801,10 +885,11 @@ def get_time_data_from_server(h5_slice, filename):

times_ok, times = get_time_data_from_server(h5_slice, _split_path(filename))

-    # Filter bad times. Last instance of bad times in archive is 2004
-    # so don't do this unless needed. Creating a new 'times' array is
-    # much more expensive than checking for np.all(times_ok).
-    times_all_ok = np.all(times_ok)
+    # Filter bad times if needed (rare except ephem). Creating a new 'times'
+    # array is much more expensive than checking for np.all(times_ok). Ephemeris
+    # content types routinely have TIME quality set to bad for overlapped data,
+    # but that is handled instead via _get_ephem_times_ok() so ignore it here.
+    times_all_ok = _is_ephem_msid(msid) or np.all(times_ok)
if not times_all_ok:
times = times[times_ok]

@@ -848,6 +933,14 @@ def get_msid_data_from_server(h5_slice, filename):
bads = bads[times_ok]
vals = vals[times_ok]

+    if _is_ephem_msid(msid):
+        # Select the most recent sample at each time for ephemeris data from
+        # available overlapping chunks.
+        ok = _get_ephem_times_ok(times)
+        times = times[ok]
+        vals = vals[ok]
+        bads = bads[ok]
+
# Slice down to exact requested time range
row0, row1 = np.searchsorted(times, [tstart, tstop])
logger.info("Slicing %s arrays [%d:%d]", msid, row0, row1)
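The chunk-masking approach used by _get_ephem_times_ok above can be exercised on its own. Below is a minimal standalone sketch, not part of this PR, that applies the same normalize-and-round, find-backward-jumps, mask-overlapped-tails steps to a small synthetic time array (assumes NumPy and Python 3.10+ for itertools.pairwise):

import itertools

import numpy as np

def ephem_times_ok(times):
    # Same steps as _get_ephem_times_ok in this PR: baseline and round the
    # times, find backward jumps marking chunk starts, then mask out the tail
    # of each chunk that the following (newer) chunk supersedes.
    times = np.round(times - times[0], -1)
    idx_neg_jumps = np.where(np.diff(times) < 0)[0] + 1
    idx_chunks = np.concatenate([[0], idx_neg_jumps, [len(times)]])
    chunks = list(itertools.pairwise(idx_chunks))
    mask = np.ones(len(times), dtype=bool)
    for (i0, i1), (j0, _) in itertools.pairwise(chunks):
        k = np.searchsorted(times[i0:i1], times[j0])
        mask[i0 + k : i1] = False
    return mask

# Two overlapping chunks at 300 sec cadence; the second chunk restarts at
# t=600 and supersedes the first chunk's samples at 600 and 900.
times = np.array([0.0, 300.0, 600.0, 900.0, 600.0, 900.0, 1200.0])
print(times[ephem_times_ok(times)])  # -> 0, 300, 600, 900, 1200 (monotonic)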
17 changes: 17 additions & 0 deletions cheta/tests/test_fetch.py
@@ -93,6 +93,23 @@ def test_filter_bad_times_overlap():
assert len(dat) == 5


+@pytest.mark.parametrize(
+    "msid_prefix",
+    [
+        "orbitephem0",
+        "lunarephem0",
+        "solarephem0",
+        "orbitephem1",
+        "lunarephem1",
+        "solarephem1",
+        "point",
+    ],
+)
+def test_fix_ephem_time_order(msid_prefix):
+    dat = fetch.MSID(f"{msid_prefix}_x", "2024:001", "2025:001")
+    assert np.unique(np.diff(dat.times)).tolist() == [300.0]
+
+
def test_filter_bad_times_list():
dat = fetch.MSID("aogyrct1", "2008:291:12:00:00", "2008:298:12:00:00")
# 2nd test of repr here where we have an MSID object handy
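The test above runs under pytest with archive access. For a quick manual check of the same behavior, a short sketch (assuming a locally configured cheta archive containing 2024 ephemeris data; the MSID name follows the test parametrization above):

import numpy as np

from cheta import fetch

dat = fetch.MSID("orbitephem0_x", "2024:001", "2024:030")
dts = np.diff(dat.times)
# With the fix, times should be strictly increasing at the nominal 300 sec cadence.
print(np.all(dts > 0), np.unique(dts))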
435 changes: 435 additions & 0 deletions validate/PR285-fix-ephemeris-time-order.ipynb

Large diffs are not rendered by default.