3 changes: 2 additions & 1 deletion .gitignore
@@ -14,6 +14,7 @@ MANIFEST
*.log
*.fits.gz
stage
+.eggs
cheta.egg-info
sken.egg-info
Ska/engarchive/GIT_VERSION
@@ -22,7 +23,7 @@ dev_utils/event_filter_1.png
dev_utils/event_filter_2.png
dev_utils/l0_5icd.pdf
dev_utils/memleak.py
-*.ipynb
+play-*.ipynb
.ipynb_checkpoints
.idea
.vscode
101 changes: 97 additions & 4 deletions cheta/fetch.py
@@ -7,6 +7,7 @@
import collections
import contextlib
import fnmatch
+import itertools
import logging
import operator
import os
@@ -140,6 +141,17 @@
# Cached version (by content type) of first and last available times in archive
CONTENT_TIME_RANGES = {}

+# CXC content types that are ephemeris data and need special handling in fetch
+EPHEM_CONTENT_TYPES = (
+    "orbitephem0",
+    "lunarephem0",
+    "solarephem0",
+    "orbitephem1",
+    "lunarephem1",
+    "solarephem1",
+    "angleephem",
+)
+

def local_or_remote_function(remote_print_output):
"""
@@ -190,6 +202,78 @@ def wrapper(*args, **kwargs):
return the_decorator


+def _is_ephem_msid(msid: str) -> bool:
+    """True if this MSID is an ephemeris MSID.
+
+    Parameters
+    ----------
+    msid : str
+        MSID to check.
+
+    Returns
+    -------
+    bool
+        True if this MSID is an ephemeris MSID.
+    """
+    return content.get(msid.upper()) in EPHEM_CONTENT_TYPES
+
+
+def _get_ephem_times_ok(times: np.ndarray[float]) -> np.ndarray[bool]:
+    """Select most recent sample times for ephemeris data from overlapping chunks.
+
+    The raw ephemeris HDF5 archive files are non-monotonic because the archive FITS
+    files are overlapping and have duplicate times. Typically new ephemeris files come
+    out every 7 days and cover many weeks of data (e.g. about 10 weeks for the
+    predictive EPHEM 0 files).
+
+    The HDF5 files on HEAD have TIME.quality set to bad for the duplicate times, so
+    this code is not strictly required there. But for archives maintained with
+    cheta_sync, the quality codes are not back-updated when new data appear, causing
+    problems (https://github.com/sot/cheta/issues/245).
+
+    This function ignores the existing quality entirely and dynamically assembles a
+    mask selecting the best (latest) time samples to use.
+
+    Parameters
+    ----------
+    times : np.ndarray[float]
+        Array of time samples (seconds since 1998.0) from an ephemeris MSID.
+
+    Returns
+    -------
+    np.ndarray[bool]
+        Boolean mask array that is True for the best samples to keep.
+    """
+    # Baseline times to start at exactly zero and round to nearest 10. Ephemeris
+    # data are nominally sampled at exactly 300 sec intervals, but sometimes there
+    # are floating point variations and sometimes 301 (leap second?), and at least
+    # one larger gap of 43436 seconds.
+    times = np.round(times - times[0], -1)
+
+    # Times here come from complete and overlapping archive file chunks. This means
+    # that each chunk starts in the middle of the previous chunk and extends beyond.
+    # First get the index of the start of each chunk where time jumps backward.
+    dts = np.diff(times)
+    idx_neg_jumps = np.where(dts < 0)[0] + 1
+
+    # Get complete chunks as pairs of (idx_start, idx_stop).
+    idx_chunks = np.concatenate([[0], idx_neg_jumps, [len(times)]])
+    chunks = list(itertools.pairwise(idx_chunks))
+
+    # Make the mask by masking out every time in the previous chunk that is at or
+    # after the start of the current chunk. This mirrors archive update processing.
+    # These masks are always correct and up to date for the primary HDF5 archive on
+    # HEAD. However, cheta_sync archives do not have the quality flags back-updated,
+    # so this dynamic masking is needed to get correct results.
+    mask = np.ones(len(times), dtype=bool)
+    for (idx_start0, idx_stop0), (idx_start1, _) in itertools.pairwise(chunks):
+        times0 = times[idx_start0:idx_stop0]
+        i0 = np.searchsorted(times0, times[idx_start1])
+        mask[idx_start0 + i0 : idx_stop0] = False
+
+    return mask
+
+
def _split_path(path):
"""
Return a tuple of the components for ``path``. Strip off the drive if
@@ -801,10 +885,11 @@ def get_time_data_from_server(h5_slice, filename):

times_ok, times = get_time_data_from_server(h5_slice, _split_path(filename))

-    # Filter bad times. Last instance of bad times in archive is 2004
-    # so don't do this unless needed. Creating a new 'times' array is
-    # much more expensive than checking for np.all(times_ok).
-    times_all_ok = np.all(times_ok)
+    # Filter bad times if needed (rare except ephem). Creating a new 'times'
+    # array is much more expensive than checking for np.all(times_ok). Ephemeris
+    # content types routinely have TIME quality set to bad for overlapped data,
+    # but that is handled instead via _get_ephem_times_ok() so ignore it here.
+    times_all_ok = _is_ephem_msid(msid) or np.all(times_ok)
if not times_all_ok:
times = times[times_ok]

@@ -848,6 +933,14 @@ def get_msid_data_from_server(h5_slice, filename):
bads = bads[times_ok]
vals = vals[times_ok]

+    if _is_ephem_msid(msid):
+        # Select the most recent sample at each time for ephemeris data from
+        # available overlapping chunks.
+        ok = _get_ephem_times_ok(times)
+        times = times[ok]
+        vals = vals[ok]
+        bads = bads[ok]
+
# Slice down to exact requested time range
row0, row1 = np.searchsorted(times, [tstart, tstop])
logger.info("Slicing %s arrays [%d:%d]", msid, row0, row1)
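The chunk-masking approach used by _get_ephem_times_ok above can be exercised on its own. Below is a minimal standalone sketch, not part of this PR, that applies the same normalize-and-round, find-backward-jumps, mask-overlapped-tails steps to a small synthetic time array (assumes NumPy and Python 3.10+ for itertools.pairwise):

import itertools

import numpy as np

def ephem_times_ok(times):
    # Same steps as _get_ephem_times_ok in this PR: baseline and round the
    # times, find backward jumps marking chunk starts, then mask out the tail
    # of each chunk that the following (newer) chunk supersedes.
    times = np.round(times - times[0], -1)
    idx_neg_jumps = np.where(np.diff(times) < 0)[0] + 1
    idx_chunks = np.concatenate([[0], idx_neg_jumps, [len(times)]])
    chunks = list(itertools.pairwise(idx_chunks))
    mask = np.ones(len(times), dtype=bool)
    for (i0, i1), (j0, _) in itertools.pairwise(chunks):
        k = np.searchsorted(times[i0:i1], times[j0])
        mask[i0 + k : i1] = False
    return mask

# Two overlapping chunks at 300 sec cadence; the second chunk restarts at
# t=600 and supersedes the first chunk's samples at 600 and 900.
times = np.array([0.0, 300.0, 600.0, 900.0, 600.0, 900.0, 1200.0])
print(times[ephem_times_ok(times)])  # -> 0, 300, 600, 900, 1200 (monotonic)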
17 changes: 17 additions & 0 deletions cheta/tests/test_fetch.py
@@ -93,6 +93,23 @@ def test_filter_bad_times_overlap():
assert len(dat) == 5


+@pytest.mark.parametrize(
+    "msid_prefix",
+    [
+        "orbitephem0",
+        "lunarephem0",
+        "solarephem0",
+        "orbitephem1",
+        "lunarephem1",
+        "solarephem1",
+        "point",
+    ],
+)
+def test_fix_ephem_time_order(msid_prefix):
+    dat = fetch.MSID(f"{msid_prefix}_x", "2024:001", "2025:001")
+    assert np.unique(np.diff(dat.times)).tolist() == [300.0]
+
+
def test_filter_bad_times_list():
dat = fetch.MSID("aogyrct1", "2008:291:12:00:00", "2008:298:12:00:00")
# 2nd test of repr here where we have an MSID object handy
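The test above runs under pytest with archive access. For a quick manual check of the same behavior, a short sketch (assuming a locally configured cheta archive containing 2024 ephemeris data; the MSID name follows the test parametrization above):

import numpy as np

from cheta import fetch

dat = fetch.MSID("orbitephem0_x", "2024:001", "2024:030")
dts = np.diff(dat.times)
# With the fix, times should be strictly increasing at the nominal 300 sec cadence.
print(np.all(dts > 0), np.unique(dts))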
435 changes: 435 additions & 0 deletions validate/PR285-fix-ephemeris-time-order.ipynb

Large diffs are not rendered by default.