Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
### Features

* filter, frequencies, refine: Added support in metadata for precise date ranges in `YYYY-MM-DD/YYYY-MM-DD` format. [#1304][] (@victorlin)
* refine: Added a new option `--keep-ids` to keep certain tips in the tree regardless of clock filtering. This allows force-inclusion similar to `augur filter`'s `--include` option, and the same file can be used for both. [#1768][] (@victorlin)

### Bug fixes

* export v2: Improved the error message that is displayed when a deprecated coloring key is used. [#1882][] (@corneliusroemer)

[#1304]: https://github.com/nextstrain/augur/issues/1304
[#1768]: https://github.com/nextstrain/augur/issues/1768
[#1882]: https://github.com/nextstrain/augur/issues/1882

## 32.1.0 (18 November 2025)
Expand Down
15 changes: 12 additions & 3 deletions augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .dates import get_numerical_dates
from .dates.errors import InvalidYearBounds
from .io.metadata import DEFAULT_DELIMITERS, DEFAULT_ID_COLUMNS, METADATA_DATE_COLUMN, InvalidDelimiter, Metadata, read_metadata
from .io.strains import read_strains
from .utils import read_tree, write_json, InvalidTreeError
from .errors import AugurError
from treetime.vcf_utils import read_vcf
Expand All @@ -17,9 +18,12 @@
def refine(tree=None, aln=None, ref=None, dates=None, branch_length_inference='auto',
confidence=False, resolve_polytomies=True, stochastic_resolve=False, max_iter=2, precision='auto',
infer_gtr=True, Tc=0.01, reroot=None, use_marginal='always', fixed_pi=None, use_fft=True,
clock_rate=None, clock_std=None, clock_filter_iqd=None, verbosity=1, covariance=True, rng_seed=None, **kwarks):
clock_rate=None, clock_std=None, clock_filter_iqd=None, keep_ids=None, verbosity=1, covariance=True, rng_seed=None, **kwarks):
from treetime import TreeTime

if keep_ids is None:
keep_ids = set()

try: #Tc could be a number or 'opt' or 'skyline'. TreeTime expects a float or int if a number.
Tc = float(Tc)
except ValueError:
Expand Down Expand Up @@ -49,7 +53,7 @@ def refine(tree=None, aln=None, ref=None, dates=None, branch_length_inference='a
# remove them explicitly
leaves = [x for x in tt.tree.get_terminals()]
for n in leaves:
if n.bad_branch:
if n.bad_branch and n.name not in keep_ids:
tt.tree.prune(n)
print('pruning leaf ', n.name)
# fix treetime set-up for new tree topology
Expand Down Expand Up @@ -203,6 +207,7 @@ def register_parser(parent_subparsers):
help='branch length mode of treetime to use')
parser.add_argument('--clock-filter-iqd', type=float, help='clock-filter: remove tips that deviate more than n_iqd '
'interquartile ranges from the root-to-tip vs time regression')
parser.add_argument('--keep-ids', metavar="FILE", help="file containing ids to keep in tree regardless of clock filtering (one per line)")
parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to')
parser.add_argument('--year-bounds', type=int, nargs='+', action=ExtendOverwriteDefault, help='specify min or max & min prediction bounds for samples with XX in year')
parser.add_argument('--divergence-units', type=str, choices=['mutations', 'mutations-per-site'],
Expand Down Expand Up @@ -319,14 +324,18 @@ def run(args):
else:
time_inference_mode = 'always' if args.date_inference=='marginal' else 'never'

keep_ids = set()
if args.keep_ids:
keep_ids = read_strains(args.keep_ids)

tt = refine(tree=T, aln=aln, ref=ref, dates=dates, confidence=args.date_confidence,
reroot=treetime_reroot,
Tc=0.01 if args.coalescent is None else args.coalescent, #use 0.01 as default coalescent time scale
use_marginal = time_inference_mode, use_fft=args.use_fft,
branch_length_inference = args.branch_length_inference or 'auto',
precision = 'auto' if args.precision is None else args.precision,
clock_rate=args.clock_rate, clock_std=args.clock_std_dev,
clock_filter_iqd=args.clock_filter_iqd, max_iter=args.max_iter,
clock_filter_iqd=args.clock_filter_iqd, keep_ids=keep_ids, max_iter=args.max_iter,
covariance=args.covariance, resolve_polytomies=(not args.keep_polytomies),
stochastic_resolve=args.stochastic_resolve, verbosity=args.verbosity, rng_seed=args.seed)

Expand Down
38 changes: 38 additions & 0 deletions tests/functional/refine/cram/keep-ids.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Setup

$ source "$TESTDIR"/_setup.sh

Baseline: without --keep-ids, 'KX369547.1' is pruned from the tree when running with --clock-filter-iqd 2.

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --output-tree tree.nwk \
> --timetree \
> --clock-filter-iqd 2 \
> --seed 314159 2>&1 | grep "pruning leaf" || echo "Nothing pruned"
pruning leaf KX369547.1

$ grep -q -F 'KX369547.1' tree.nwk && echo 'Present' || echo 'Pruned'
Pruned

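Use --keep-ids to force-include it. The ids file below carries a trailing "# ..." comment after the id; the expected output shows the id is still matched.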

$ cat > include.txt <<~~
> KX369547.1 # Keep me
> ~~

$ ${AUGUR} refine \
> --tree "$TESTDIR/../data/tree_raw.nwk" \
> --alignment "$TESTDIR/../data/aligned.fasta" \
> --metadata "$TESTDIR/../data/metadata.tsv" \
> --output-tree tree.nwk \
> --timetree \
> --clock-filter-iqd 2 \
> --keep-ids include.txt \
> --seed 314159 2>&1 | grep "pruning leaf" || echo "Nothing pruned"
Nothing pruned

$ grep -q -F 'KX369547.1' tree.nwk && echo 'Present' || echo 'Pruned'
Present
Loading