diff --git a/CHANGES.md b/CHANGES.md index 04ceb10ae..227e0c924 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,8 +2,10 @@ ## __NEXT__ +* `augur refine` will now warn when building a time tree if fewer than half of the sequence IDs in the tree match metadata IDs, suggesting the use of `--metadata-id-columns` to explicitly set the correct ID column. [#1902][] (@corneliusroemer) * `augur curate apply-record-annotations` will now warn if an annotation was unnecessary, often indicative of the upstream data being updated. [#1893][] (@jameshadfield) +[#1902]: https://github.com/nextstrain/augur/pull/1902 [#1893]: https://github.com/nextstrain/augur/pull/1893 ## 31.5.0 (17 September 2025) diff --git a/augur/refine.py b/augur/refine.py index 183198e15..565b64608 100644 --- a/augur/refine.py +++ b/augur/refine.py @@ -309,6 +309,16 @@ def run(args): except InvalidYearBounds as error: raise AugurError(f"Invalid value for --year-bounds: {error}") + terminal_names = {n.name for n in T.get_terminals()} + matched_ids = terminal_names & set(dates.keys()) + + if len(matched_ids) < len(terminal_names) * 0.5: + print(f"\nWARNING: For {len(terminal_names)} sequence IDs, only {len(matched_ids)} corresponding metadata rows could be matched.", file=sys.stderr) + print(f" Metadata has {len(dates)} entries in total.", file=sys.stderr) + print(f" Metadata is using '{metadata_object.id_column}' as the ID column.", file=sys.stderr) + print(f" You may need to explicitly set the metadata ID column using --metadata-id-columns.", file=sys.stderr) + print(f" By default, the columns {DEFAULT_ID_COLUMNS} are tried in order.\n", file=sys.stderr) + # save input state string for later export for n in T.get_terminals(): if n.name in metadata.index and METADATA_DATE_COLUMN in metadata.columns: diff --git a/tests/functional/refine/cram/metadata-id-mismatch.t b/tests/functional/refine/cram/metadata-id-mismatch.t new file mode 100644 index 000000000..c28e2fb9c --- /dev/null +++ 
b/tests/functional/refine/cram/metadata-id-mismatch.t @@ -0,0 +1,50 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Create metadata with mismatched IDs by adding a 'name' column with correct IDs +and replacing the 'strain' column with wrong IDs. + + $ awk -F'\t' 'BEGIN {OFS="\t"} NR==1 {print $0,"name"} NR>1 {print "wrong_id_"NR,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$1}' \ + > "$TESTDIR/../data/metadata.tsv" > metadata_mismatch.tsv + +Try building a time tree with mismatched sequence and metadata IDs. +This should produce a warning because the default 'strain' column has wrong IDs. +The command will fail due to insufficient matching IDs; stdout is redirected so only the warning and error messages are checked. + + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata_mismatch.tsv \ + > --output-tree tree_mismatch.nwk \ + > --output-node-data branch_lengths_mismatch.json \ + > --timetree \ + > --seed 314159 > /dev/null + + WARNING: For 11 sequence IDs, only 0 corresponding metadata rows could be matched. + Metadata has 12 entries in total. + Metadata is using 'strain' as the ID column. + You may need to explicitly set the metadata ID column using --metadata-id-columns. + By default, the columns ('strain', 'name') are tried in order. + + ERROR: ERROR: ALMOST NO VALID DATE CONSTRAINTS + + + ERROR from TreeTime: This error is most likely due to a problem with your input data. + Please check your input data and try again. If you continue to have problems, please open a new issue including + the original command and the error above: + + [2] + +Now try with the correct ID column specified. +This should work without the ID mismatch warning. 
+ + $ ${AUGUR} refine \ + > --tree "$TESTDIR/../data/tree_raw.nwk" \ + > --alignment "$TESTDIR/../data/aligned.fasta" \ + > --metadata metadata_mismatch.tsv \ + > --metadata-id-columns name \ + > --output-tree tree_correct.nwk \ + > --output-node-data branch_lengths_correct.json \ + > --timetree \ + > --seed 314159 > /dev/null