From b5a591fc9b4a597e837d7ffebedc440bea6b4e9e Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 25 Mar 2025 16:33:21 -0700 Subject: [PATCH 1/5] Support precise date ranges This has more flexibility than ambiguous dates in reduced precision which are limited to the range of a single calendar year or month. --- augur/dates/__init__.py | 17 +++++++++++++++++ docs/faq/metadata.rst | 3 +++ tests/dates/test_dates.py | 9 +++++++++ 3 files changed, 29 insertions(+) diff --git a/augur/dates/__init__.py b/augur/dates/__init__.py index 9946fa2f8..b7bffadd1 100644 --- a/augur/dates/__init__.py +++ b/augur/dates/__init__.py @@ -160,6 +160,12 @@ def is_date_ambiguous(date, ambiguous_by): Note that this can support any date format, not just YYYY-MM-DD. """ +RE_DATE_RANGE = re.compile(r'^\d{4}-\d{2}-\d{2}/\d{4}-\d{2}-\d{2}$') +""" +Matches a date range in YYYY-MM-DD/YYYY-MM-DD format. +Note that this is a subset of the ISO 8601 time interval format. +""" + @cache def get_numerical_date_from_value(value, fmt, min_max_year=None) -> Union[float, Tuple[float, float], None]: value = str(value) @@ -199,6 +205,17 @@ def get_numerical_date_from_value(value, fmt, min_max_year=None) -> Union[float, # closest in-bound value. raise InvalidDate(value, str(error)) from error + if RE_DATE_RANGE.match(value): + start, end = value.split("/") + + start = datetime.datetime.strptime(start, "%Y-%m-%d") + end = datetime.datetime.strptime(end , "%Y-%m-%d") + + if start > end: + raise InvalidDate(value, f"Start {start!r} is later than end {end!r}") + + return (date_to_numeric(start), date_to_numeric(end)) + # 4. Return none (silent error) if the date does not match any of the checked formats. return None diff --git a/docs/faq/metadata.rst b/docs/faq/metadata.rst index 23de020a5..692002002 100644 --- a/docs/faq/metadata.rst +++ b/docs/faq/metadata.rst @@ -64,6 +64,9 @@ Ambiguity over a range of dates is supported in the following formats: (e.g.. 
``2018``, ``2018-03``) 2. Augur-style reduced precision format, i.e. ISO 8601 format with unknown parts explicitly masked by ``XX`` (e.g. ``2018-XX-XX``, ``2018-03-XX``) +3. `<start>/<end>` range format, where `<start>` and `<end>` are exact dates in `YYYY-MM-DD` format. + This is a subset of `ISO 8601 interval format <https://en.wikipedia.org/wiki/ISO_8601#Time_intervals>`__. + (e.g. ``2017-12-01/2018-03-25``) **Geography** diff --git a/tests/dates/test_dates.py b/tests/dates/test_dates.py index ecf7cba93..15f6900b5 100644 --- a/tests/dates/test_dates.py +++ b/tests/dates/test_dates.py @@ -134,6 +134,15 @@ def test_get_numerical_date_from_value_current_day_limit(self): == pytest.approx(2000.138, abs=1e-3) ) + def test_get_numerical_date_from_value_range(self): + assert dates.get_numerical_date_from_value("2019-01-02/2019-03-04", fmt="unused") == ( + pytest.approx(dates.numeric_date(datetime.date(year=2019, month=1, day=2)), abs=1e-3), + pytest.approx(dates.numeric_date(datetime.date(year=2019, month=3, day=4)), abs=1e-3), + ) + + # Using a numeric date as a bound is not valid. + assert dates.get_numerical_date_from_value("2019.0/2019-06-01", fmt="unused") == None + def test_is_date_ambiguous(self): """is_date_ambiguous should return true for ambiguous dates and false for valid dates.""" # Test complete date strings with ambiguous values. From 18422e7ca740dc9ba0ed13d6c3b77b1ee338f655 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 11 Mar 2025 15:47:31 -0700 Subject: [PATCH 2/5] Update changelog --- CHANGES.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index 1385965d4..b86386a15 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,11 +2,16 @@ ## __NEXT__ +### Features + +* filter, frequencies, refine: Added support in metadata for precise date ranges in `YYYY-MM-DD/YYYY-MM-DD` format. [#1304][] (@victorlin) + ### Bug fixes * curate format-dates: Removed redundant warning messages that were previously displayed when using `--failure-reporting "warn"`. 
[#1816][] (@victorlin) * merge: Fixed a performance bug where input sequence file validation unnecessarily loaded file contents into device memory. [#1820][] (@victorlin) +[#1304]: https://github.com/nextstrain/augur/issues/1304 [#1816]: https://github.com/nextstrain/augur/pull/1816 [#1820]: https://github.com/nextstrain/augur/pull/1820 From 9094e8b8d4271babda5afa653480dfc5ad1b449c Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Jun 2025 00:08:22 -0700 Subject: [PATCH 3/5] =?UTF-8?q?Make=20XXXX-XX-XX=20represent=20(-=E2=88=9E?= =?UTF-8?q?,=E2=88=9E)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the value used by augur curate format-dates to represent a lack of date information. Conceptually, this should represent a maximally wide date interval. Previously, this value went through AmbiguousDate.range() which returned `[0001-01-01, <current date>]`. The lower end is a limitation of the ISO 8601 format, and the upper limit is a somewhat opinionated default behavior. Since dates are stored numerically, the maximally wide date interval is (-∞,∞). The motivation for making this change is for the upcoming apply-date-bounds command which will need to check whether ends of an interval are defined. It doesn't make sense to check against `0001-01-01` or `<current date>` since those are values that can be used to define an interval. --- augur/dates/__init__.py | 18 +++++++++++++++--- tests/dates/test_dates.py | 5 +++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/augur/dates/__init__.py b/augur/dates/__init__.py index b7bffadd1..a77dc27ce 100644 --- a/augur/dates/__init__.py +++ b/augur/dates/__init__.py @@ -154,6 +154,11 @@ def is_date_ambiguous(date, ambiguous_by): Those should be further validated by date conversion functions. """ +RE_AUGUR_UNKNOWN_DATE = re.compile(r'^XXXX-XX-XX$') +""" +Matches an Augur-style unknown date. 
+""" + RE_AUGUR_AMBIGUOUS_DATE = re.compile(r'.*XX.*') """ Matches an Augur-style ambiguous date with 'XX' used to mask unknown parts of the date. @@ -177,13 +182,20 @@ def get_numerical_date_from_value(value, fmt, min_max_year=None) -> Union[float, except: pass - # 2. Check if value is an ambiguous date in the specified format (fmt). + # 2. Check if value is an unknown date. + # This is checked before ambiguous dates since it is a subset of that with + # special handling. + + if RE_AUGUR_UNKNOWN_DATE.match(value): + return (float("-inf"), float("inf")) + + # 3. Check if value is an ambiguous date in the specified format (fmt). if RE_AUGUR_AMBIGUOUS_DATE.match(value): start, end = AmbiguousDate(value, fmt=fmt).range(min_max_year=min_max_year) return (date_to_numeric(start), date_to_numeric(end)) - # 3. Check formats that are always supported. + # 4. Check formats that are always supported. if RE_NUMERIC_DATE.match(value): return float(value) @@ -216,7 +228,7 @@ def get_numerical_date_from_value(value, fmt, min_max_year=None) -> Union[float, return (date_to_numeric(start), date_to_numeric(end)) - # 4. Return none (silent error) if the date does not match any of the checked formats. + # 5. Return none (silent error) if the date does not match any of the checked formats. 
return None diff --git a/tests/dates/test_dates.py b/tests/dates/test_dates.py index 15f6900b5..ead8b78a2 100644 --- a/tests/dates/test_dates.py +++ b/tests/dates/test_dates.py @@ -26,6 +26,11 @@ def test_get_numerical_date_from_value_not_ambiguous(self): == pytest.approx(2000.242, abs=1e-3) ) + def test_get_numerical_date_from_value_unknown_date(self): + assert (dates.get_numerical_date_from_value("XXXX-XX-XX", "%Y-%m-%d") + == (float("-inf"), float("inf")) + ) + @pytest.mark.parametrize( "value", [ From 6b16602b24420ecc36fe2bd13cca2bf5e732ec35 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Jun 2025 00:17:49 -0700 Subject: [PATCH 4/5] Initial implementation of curate apply-date-bounds This is a new command that applies lower and/or upper bounds to values in an existing date column. --- augur/curate/__init__.py | 3 +- augur/curate/apply_date_bounds.py | 231 ++++++++++++++++++ .../augur.curate.apply_date_bounds.rst | 7 + docs/api/developer/augur.curate.rst | 1 + .../curate/cram/apply-date-bounds/errors.t | 52 ++++ tests/test_curate_apply_date_bounds.py | 140 +++++++++++ 6 files changed, 433 insertions(+), 1 deletion(-) create mode 100644 augur/curate/apply_date_bounds.py create mode 100644 docs/api/developer/augur.curate.apply_date_bounds.rst create mode 100644 tests/functional/curate/cram/apply-date-bounds/errors.t create mode 100644 tests/test_curate_apply_date_bounds.py diff --git a/augur/curate/__init__.py b/augur/curate/__init__.py index f2af6b7d9..ef55540ee 100644 --- a/augur/curate/__init__.py +++ b/augur/curate/__init__.py @@ -13,7 +13,7 @@ from augur.io.metadata import DEFAULT_DELIMITERS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv from augur.io.sequences import write_records_to_fasta from augur.types import DataErrorMethod -from . 
import format_dates, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location, transform_strain_name, rename +from . import format_dates, apply_date_bounds, normalize_strings, passthru, titlecase, apply_geolocation_rules, apply_record_annotations, abbreviate_authors, parse_genbank_location, transform_strain_name, rename SUBCOMMAND_ATTRIBUTE = '_curate_subcommand' @@ -21,6 +21,7 @@ passthru, normalize_strings, format_dates, + apply_date_bounds, titlecase, apply_geolocation_rules, apply_record_annotations, diff --git a/augur/curate/apply_date_bounds.py b/augur/curate/apply_date_bounds.py new file mode 100644 index 000000000..f95b8ec2f --- /dev/null +++ b/augur/curate/apply_date_bounds.py @@ -0,0 +1,231 @@ +""" +Impose lower and/or upper bounds on dates in a column. + +Updated values are formatted as ISO 8601 intervals. +""" +import argparse +import datetime +from textwrap import dedent +from treetime.utils import datestring_from_numeric +from typing import Any, Dict, Iterable, Tuple, Optional, Union + +from augur.dates import date_to_numeric, get_numerical_date_from_value +from augur.errors import AugurError +from augur.io.print import print_err, indented_list +from augur.types import DataErrorMethod +from augur.utils import first_line + + +TODAY = 'today' + + +RecordType = Dict[str, Any] + # maybe Dict[str, str]? + + +def register_parser(parent_subparsers): + parser = parent_subparsers.add_parser("apply-date-bounds", + parents=[parent_subparsers.shared_parser], + help=first_line(__doc__)) + + required = parser.add_argument_group(title="REQUIRED") + required.add_argument("--date-field", metavar="NAME", + help=dedent("""\ + Name of an existing date field to apply bounds to. 
Values will be + formatted as an interval using bounds provided by --lower-bound + and/or --upper-bound.""")) + + optional = parser.add_argument_group(title="OPTIONAL") + optional.add_argument("--lower-bound", metavar="NAME | DATE", + help=dedent("""\ + Name of an existing date field or date to use as the lower bound for + --date-field (i.e. minimum).""")) + optional.add_argument("--upper-bound", metavar=f"NAME | DATE | {TODAY!r}", + help=dedent(f"""\ + Name of an existing date field or date to use as the upper bound for + --date-field (i.e. maximum). Use {TODAY!r} to set the current date as + the upper bound.""")) + optional.add_argument("--failure-reporting", + type=DataErrorMethod.argtype, + choices=list(DataErrorMethod), + default=DataErrorMethod.ERROR_FIRST, + help="How should failed date formatting be reported.") + + return parser + + +def run(args: argparse.Namespace, records: Iterable[RecordType]): + validate_arguments(args) + + failures = [] + + for index, input_record in enumerate(records): + record = input_record.copy() + try: + record[args.date_field] = Record(record, index).get_bounded_date(args) + except DataError as error: + if args.failure_reporting is DataErrorMethod.SILENT: + continue + if args.failure_reporting is DataErrorMethod.ERROR_FIRST: + raise error + if args.failure_reporting is DataErrorMethod.WARN: + print_err(f"WARNING: {error}") + failures.append(error) + continue + if args.failure_reporting is DataErrorMethod.ERROR_ALL: + failures.append(error) + continue + else: + raise ValueError(f"Encountered unhandled failure reporting method: {args.failure_reporting!r}") + yield record + + if args.failure_reporting is not DataErrorMethod.SILENT and failures: + if args.failure_reporting is DataErrorMethod.ERROR_ALL: + raise AugurError(dedent(f"""\ + Unable to apply bounds. 
All errors: + {indented_list(map(str, failures), " ")}""")) + + +def validate_arguments(args: argparse.Namespace): + if not args.lower_bound and not args.upper_bound: + raise AugurError("At least one of --lower-bound and --upper-bound is required.") + + +class Record: + """ + Helper class to wrap a record, its id, and arguments for ease of error handling and small functions. + """ + + def __init__(self, data: RecordType, record_id: int) -> None: + self.data = data + self.id = record_id + + def get_bounded_date(self, args: argparse.Namespace) -> str: + """ + Returns a date string representing the date with bounds applied. + """ + start, end = self.convert_date_to_range(args.date_field) + + lower_bound, upper_bound = self.get_bounds(args.lower_bound, args.upper_bound) + + # If any ends are unbounded, return the original date + if (start == float("-inf") and lower_bound is None) or \ + (end == float( "inf") and upper_bound is None): + return self.data[args.date_field] + + # Error if both start and end are out of bounds (no overlap with the bounds). + if lower_bound and start < lower_bound and end < lower_bound: + self.raise_data_error( + f"{args.date_field!r}={self.data[args.date_field]!r} " + f"is earlier than the lower bound of " + f"{args.lower_bound!r}={self.data[args.lower_bound]!r}" + ) + if upper_bound and start > upper_bound and end > upper_bound: + self.raise_data_error( + f"{args.date_field!r}={self.data[args.date_field]!r} " + f"is later than the upper bound of " + f"{args.upper_bound!r}={self.data[args.upper_bound]!r}" + ) + + # If the target date overlaps with the bounds, apply the bounds. + # The start should be no earlier than the lower bound + # and the end should be no later than the upper bound. 
+ if lower_bound: + start = max(start, lower_bound) + if upper_bound: + end = min(end, upper_bound) + + # ISO 8601 interval in <start>/<end> format + return f"{datestring_from_numeric(start)}/{datestring_from_numeric(end)}" + + def convert_date_to_range(self, date_field: str) -> Tuple[float, float]: + original_date = self.data.get(date_field) + + if original_date is None: + self.raise_data_error( + f"Missing date field {date_field!r}." + ) + + date = get_numerical_date_from_value(original_date, fmt="%Y-%m-%d") + + if date == None: + self.raise_data_error( + f"Unable to parse value from {date_field!r} as a date: {original_date!r}. " + "Consider formatting values with augur curate format-dates before applying bounds." + ) + + start, end = float('-inf'), float('inf') + + if isinstance(date, tuple): + start, end = date + elif isinstance(date, float): + start = date + end = date + elif isinstance(date, int): + start = float(date) + end = float(date) + + return start, end + + def get_bounds(self, lower_bound_field_or_value: Optional[str], upper_bound_field_or_value: Optional[str]) -> Tuple[Optional[float], Optional[float]]: + """ + Returns a tuple representing lower and upper bounds. + """ + lower_bound: Union[float, Tuple[float, float], None] = None + upper_bound: Union[float, Tuple[float, float], None] = None + + if lower_bound_field_or_value is not None: + value = self.data.get(lower_bound_field_or_value) + if value is not None: + # Input is a field name + lower_bound = get_numerical_date_from_value(value, fmt="%Y-%m-%d") + if lower_bound is None: + self.raise_data_error( + f"Unable to parse value from {lower_bound_field_or_value!r} as a date: {value!r}. " + "Consider formatting values with augur curate format-dates before applying bounds." 
+ ) + else: + # Try parsing as a date + lower_bound = get_numerical_date_from_value(lower_bound_field_or_value, fmt="%Y-%m-%d") + if lower_bound is None: + raise AugurError(f"Expected --lower-bound to be a field name or date, but got {lower_bound_field_or_value!r}.") + + if upper_bound_field_or_value == TODAY: + if TODAY in self.data: + raise AugurError(f"{TODAY!r} is ambiguous as it is both an alias to the current date and a field name.") + upper_bound = date_to_numeric(datetime.date.today()) + elif upper_bound_field_or_value is not None: + value = self.data.get(upper_bound_field_or_value) + if value is not None: + # Input is a field name + upper_bound = get_numerical_date_from_value(value, fmt="%Y-%m-%d") + if upper_bound is None: + self.raise_data_error( + f"Unable to parse value from {upper_bound_field_or_value!r} as a date: {value!r}. " + "Consider formatting values with augur curate format-dates before applying bounds." + ) + else: + # Try parsing as a date + upper_bound = get_numerical_date_from_value(upper_bound_field_or_value, fmt="%Y-%m-%d") + if upper_bound is None: + raise AugurError(f"Expected --upper-bound to be a field name or date, but got {upper_bound_field_or_value!r}.") + + # Resolve ranges to single values + if isinstance(lower_bound, tuple): + lower_bound = lower_bound[0] + if isinstance(upper_bound, tuple): + upper_bound = upper_bound[1] + + return lower_bound, upper_bound + + def raise_data_error(self, message: str) -> None: + raise DataError(self.id, message) + + +class DataError(AugurError): + def __init__(self, record_id: int, message: str): + self.record_id = record_id + self.message = message + + def __str__(self): + return f"[record {self.record_id}] {self.message}" diff --git a/docs/api/developer/augur.curate.apply_date_bounds.rst b/docs/api/developer/augur.curate.apply_date_bounds.rst new file mode 100644 index 000000000..a7a84915e --- /dev/null +++ b/docs/api/developer/augur.curate.apply_date_bounds.rst @@ -0,0 +1,7 @@ 
+augur.curate.apply\_date\_bounds module +============================================= + +.. automodule:: augur.curate.apply_date_bounds + :members: + :show-inheritance: + :undoc-members: diff --git a/docs/api/developer/augur.curate.rst b/docs/api/developer/augur.curate.rst index 5e81dd3fc..551b524a4 100644 --- a/docs/api/developer/augur.curate.rst +++ b/docs/api/developer/augur.curate.rst @@ -13,6 +13,7 @@ Submodules :maxdepth: 4 augur.curate.abbreviate_authors + augur.curate.apply_date_bounds augur.curate.apply_geolocation_rules augur.curate.apply_record_annotations augur.curate.format_dates diff --git a/tests/functional/curate/cram/apply-date-bounds/errors.t b/tests/functional/curate/cram/apply-date-bounds/errors.t new file mode 100644 index 000000000..a75c96d7e --- /dev/null +++ b/tests/functional/curate/cram/apply-date-bounds/errors.t @@ -0,0 +1,52 @@ +Setup + + $ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}" + +Create NDJSON file for testing. + + $ cat >records.ndjson <<~~ + > {"record": 1, "date": "2021", "collectionDate": "2020-01-23"} + > {"record": 2, "date": "2022", "collectionDate": "2020-01-23"} + > ~~ + +The default behavior of data error handling is to stop on the first error. + + $ cat records.ndjson \ + > | ${AUGUR} curate apply-date-bounds \ + > --date-field date \ + > --upper-bound collectionDate 1> /dev/null + ERROR: [record 0] 'date'='2021' is later than the upper bound of 'collectionDate'='2020-01-23' + [2] + +Data errors can be batch reported all at once. + + $ cat records.ndjson \ + > | ${AUGUR} curate apply-date-bounds \ + > --failure-reporting "error_all" \ + > --date-field date \ + > --upper-bound collectionDate 1> /dev/null + ERROR: Unable to apply bounds. All errors: + [record 0] 'date'='2021' is later than the upper bound of 'collectionDate'='2020-01-23' + [record 1] 'date'='2022' is later than the upper bound of 'collectionDate'='2020-01-23' + [2] + +Data errors can emit warnings instead of a failure. 
+ + $ cat records.ndjson \ + > | ${AUGUR} curate apply-date-bounds \ + > --failure-reporting "warn" \ + > --date-field date \ + > --upper-bound collectionDate + WARNING: [record 0] 'date'='2021' is later than the upper bound of 'collectionDate'='2020-01-23' + WARNING: [record 1] 'date'='2022' is later than the upper bound of 'collectionDate'='2020-01-23' + +Errors regarding the bounds themselves are not considered data errors and will stop on +the first error regardless of --failure-reporting. + + $ cat records.ndjson \ + > | ${AUGUR} curate apply-date-bounds \ + > --failure-reporting "silent" \ + > --date-field date \ + > --upper-bound collectionDate2 + ERROR: Expected --upper-bound to be a field name or date, but got 'collectionDate2'. + [2] diff --git a/tests/test_curate_apply_date_bounds.py b/tests/test_curate_apply_date_bounds.py new file mode 100644 index 000000000..72dfa5cc4 --- /dev/null +++ b/tests/test_curate_apply_date_bounds.py @@ -0,0 +1,140 @@ +import pytest +from argparse import Namespace +from freezegun import freeze_time +from augur.curate.apply_date_bounds import Record, DataError +from augur.types import DataErrorMethod +from augur.errors import AugurError + + +def make_args(lower=None, upper=None): + return Namespace( + date_field="date", + lower_bound=lower, + upper_bound=upper, + failure_reporting=DataErrorMethod.ERROR_FIRST + ) + +@pytest.mark.parametrize( + " date , lower , upper , expected_interval", + [ + # An exact date is converted to an interval for explicitness. + ("2020-01-15" , "2020-01-10" , "2020-01-20" , "2020-01-15/2020-01-15"), + + # A date representing an interval can be bounded. + ("2020" , "2020-02-10" , "2020-05-20" , "2020-02-10/2020-05-20"), + ("2020-01-01/2020-07-01" , "2020-02-10" , "2020-05-20" , "2020-02-10/2020-05-20"), + + # Bounds can represent intervals too. 
+ ("2020" , "2020-02" , "2020-05" , "2020-02-01/2020-05-31"), + ], +) +def test_date_formats(date, lower, upper, expected_interval): + """ + Test various date formats in each field. + """ + record = Record({"date": date, "rootDate": lower, "collectionDate": upper}, 0) + args = make_args(lower="rootDate", upper="collectionDate") + assert record.get_bounded_date(args) == expected_interval + + +@pytest.mark.parametrize( + "data, lower, upper, expected_interval", + [ + # A constant date can be given directly as a bound instead of a field name. + ({"date": "2020"}, "2020-01-10", None, "2020-01-10/2020-12-31"), + ({"date": "2020"}, None, "2020-01-10", "2020-01-01/2020-01-10"), + ], +) +def test_constant_bounds(data, lower, upper, expected_interval): + """ + Test handling of constant bounds. + """ + record = Record(data, 0) + args = make_args(lower=lower, upper=upper) + assert record.get_bounded_date(args) == expected_interval + + +@pytest.mark.parametrize( + "data, lower, upper, expected", + [ + # When both bounds are defined, the date is constructed from the bounds. + ( + {"date": "XXXX-XX-XX", "rootDate": "2020-01-10", "collectionDate": "2020-01-20"}, + "rootDate", + "collectionDate", + "2020-01-10/2020-01-20" + ), + # When a single bound is defined, the date is returned unchanged. + ( + {"date": "XXXX-XX-XX", "collectionDate": "2020-01-20"}, + None, + "collectionDate", + "XXXX-XX-XX" + ), + ( + {"date": "XXXX-XX-XX", "rootDate": "2020-01-10"}, + "rootDate", + None, + "XXXX-XX-XX" + ), + ], +) +def test_unknown_date(data, lower, upper, expected): + """ + Test handling unknown date with both or single bounds. 
+ """ + record = Record(data, 0) + args = make_args(lower=lower, upper=upper) + assert record.get_bounded_date(args) == expected + + +@pytest.mark.parametrize( + "data, lower, upper, expected_message_substring", + [ + ({}, None, None, "Missing date field 'date'"), + ({"date": "?"}, None, None, "Unable to parse value from 'date' as a date"), + ({"date": "2020-01-01", "collectionDate": ""}, None, "collectionDate", "Unable to parse value from 'collectionDate' as a date"), + + # An error is shown if it is entirely out of bounds. + ({"date": "2020-01-05", "rootDate": "2020-01-10"}, "rootDate", None, "earlier than the lower bound"), + ({"date": "2020-01-15", "collectionDate": "2020-01-10"}, None, "collectionDate", "later than the upper bound"), + ], +) +def test_data_errors(data, lower, upper, expected_message_substring): + """ + Test various data errors. + """ + record = Record(data, 0) + args = make_args(lower=lower, upper=upper) + with pytest.raises(DataError) as exc: + record.get_bounded_date(args) + assert expected_message_substring in str(exc.value) + + +@pytest.mark.parametrize( + "data, lower, upper, expected_message_substring", + [ + ({"date": "2020-01-15"}, "invalid-bound", None, "Expected --lower-bound to be a field name or date"), + ({"date": "2020-01-15"}, None, "invalid-bound", "Expected --upper-bound to be a field name or date"), + ({"date": "2020-01-15", "today": "something"}, None, "today", "'today' is ambiguous as it is both an alias to the current date and a field name"), + ], +) +def test_user_errors(data, lower, upper, expected_message_substring): + """ + Test various user errors. + """ + record = Record(data, 0) + args = make_args(lower=lower, upper=upper) + with pytest.raises(AugurError) as exc: + record.get_bounded_date(args) + assert expected_message_substring in str(exc.value) + + +@freeze_time("2020-01-15") +def test_today(): + """ + Test special handling of "today" as an upper bound. 
+ """ + record = Record({"date": "2020", "rootDate": "2020-01-10"}, 0) + args = make_args(lower="rootDate", upper="today") + assert record.get_bounded_date(args) == "2020-01-10/2020-01-15" From b4505455a983fff17553cee528617b0a9d626b83 Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 10 Jun 2025 17:26:10 -0700 Subject: [PATCH 5/5] Update changelog --- CHANGES.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGES.md b/CHANGES.md index b86386a15..5c56de21f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -5,6 +5,7 @@ ### Features * filter, frequencies, refine: Added support in metadata for precise date ranges in `YYYY-MM-DD/YYYY-MM-DD` format. [#1304][] (@victorlin) +* A new command, `augur curate apply-date-bounds`, allows imposing lower and/or upper bounds on date values using metadata such as an upper bound of collection date. The modified date values are in the `YYYY-MM-DD/YYYY-MM-DD` format described in the previous feature. [#1494] ### Bug fixes @@ -12,6 +13,7 @@ * merge: Fixed a performance bug where input sequence file validation unnecessarily loaded file contents into device memory. [#1820][] (@victorlin) [#1304]: https://github.com/nextstrain/augur/issues/1304 +[#1494]: https://github.com/nextstrain/augur/issues/1494 [#1816]: https://github.com/nextstrain/augur/pull/1816 [#1820]: https://github.com/nextstrain/augur/pull/1820