diff --git a/augur/filter/_run.py b/augur/filter/_run.py
index cf35f0b14..48b04d151 100644
--- a/augur/filter/_run.py
+++ b/augur/filter/_run.py
@@ -402,7 +402,7 @@ def run(args):
     strains_file = None
     if args.output_strains:
         strains_file = args.output_strains
-    elif args.output_sequences:
+    elif args.output_sequences or args.output_metadata:
         strains_file = NamedTemporaryFile(delete=False).name

     if strains_file is not None:
@@ -419,14 +419,13 @@
             write_vcf(args.sequences, args.output_sequences, dropped_samps)
         else:
             subset_fasta(args.sequences, args.output_sequences, strains_file, args.nthreads)
-        if not args.output_strains:
-            os.remove(strains_file)

     if args.output_metadata:
         print_debug(f"Reading metadata from {args.metadata!r} and writing to {args.output_metadata!r}…")
-        write_output_metadata(args.metadata, args.metadata_delimiters,
-                              args.metadata_id_columns, args.output_metadata,
-                              valid_strains)
+        write_output_metadata(args.metadata, metadata_object.id_column, args.output_metadata, strains_file)
+
+    if not args.output_strains:
+        os.remove(strains_file)

     # Calculate the number of strains that don't exist in either metadata or
     # sequences.
diff --git a/augur/filter/io.py b/augur/filter/io.py
index e8061efb9..49b1ed9af 100644
--- a/augur/filter/io.py
+++ b/augur/filter/io.py
@@ -1,18 +1,20 @@
 import argparse
-import csv
 from argparse import Namespace
 import os
 import re
+from shlex import quote as shquote
+from shutil import which
 from textwrap import dedent
-from typing import Sequence, Set
+from typing import Sequence
 import numpy as np
 from collections import defaultdict
-from xopen import xopen

 from augur.errors import AugurError
 from augur.io.file import open_file
-from augur.io.metadata import Metadata, METADATA_DATE_COLUMN
+from augur.io.metadata import METADATA_DATE_COLUMN
 from augur.io.print import print_err
+from augur.io.shell_command_runner import run_shell_command
+from augur.utils import augur
 from .constants import GROUP_BY_GENERATED_COLUMNS
 from .include_exclude_rules import extract_variables, parse_filter_query

@@ -96,25 +98,29 @@ def constant_factory(value):
         raise AugurError(f"missing or malformed priority scores file {fname}")


-def write_output_metadata(input_metadata_path: str, delimiters: Sequence[str],
-                          id_columns: Sequence[str], output_metadata_path: str,
-                          ids_to_write: Set[str]):
+def write_output_metadata(input_filename: str, id_column: str, output_filename: str, ids_file: str):
     """
-    Write output metadata file given input metadata information and a set of IDs
-    to write.
+    Write output metadata file given input metadata information and a file
+    containing ids to write.
     """
-    input_metadata = Metadata(input_metadata_path, delimiters, id_columns)
-
-    with xopen(output_metadata_path, "w", newline="") as output_metadata_handle:
-        output_metadata = csv.DictWriter(output_metadata_handle, fieldnames=input_metadata.columns,
-                                         delimiter="\t", lineterminator=os.linesep)
-        output_metadata.writeheader()
-
-        # Write outputs based on rows in the original metadata.
-        for row in input_metadata.rows():
-            row_id = row[input_metadata.id_column]
-            if row_id in ids_to_write:
-                output_metadata.writerow(row)
+    # FIXME: make this a function like augur() and seqkit()
+    tsv_join = which("tsv-join")
+
+    command = f"""
+        {augur()} read-file {shquote(input_filename)} |
+        {tsv_join} -H --filter-file <(printf "%s\n" {shquote(id_column)}; cat {shquote(ids_file)}) --key-fields {shquote(id_column)} |
+        {augur()} write-file {shquote(output_filename)}
+    """
+
+    try:
+        run_shell_command(command, raise_errors=True)
+    except Exception:
+        if os.path.isfile(output_filename):
+            # Remove the partial output file.
+            os.remove(output_filename)
+            raise AugurError(f"Metadata output failed, see error(s) above.")
+        else:
+            raise AugurError(f"Metadata output failed, see error(s) above. The command may have already written data to stdout. You may want to clean up any partial outputs.")


 # These are the types accepted in the following function.
diff --git a/tests/functional/filter/cram/filter-output-metadata-compressed.t b/tests/functional/filter/cram/filter-output-metadata-compressed.t
new file mode 100644
index 000000000..4a2b22906
--- /dev/null
+++ b/tests/functional/filter/cram/filter-output-metadata-compressed.t
@@ -0,0 +1,29 @@
+Setup
+
+  $ source "$TESTDIR"/_setup.sh
+
+Use the same options with 3 different compression methods.
+
+  $ ${AUGUR} filter \
+  >   --metadata "$TESTDIR/../data/metadata.tsv" \
+  >   --subsample-max-sequences 5 \
+  >   --subsample-seed 0 \
+  >   --output-metadata filtered_metadata.tsv.gz 2>/dev/null
+
+  $ ${AUGUR} filter \
+  >   --metadata "$TESTDIR/../data/metadata.tsv" \
+  >   --subsample-max-sequences 5 \
+  >   --subsample-seed 0 \
+  >   --output-metadata filtered_metadata.tsv.xz 2>/dev/null
+
+  $ ${AUGUR} filter \
+  >   --metadata "$TESTDIR/../data/metadata.tsv" \
+  >   --subsample-max-sequences 5 \
+  >   --subsample-seed 0 \
+  >   --output-metadata filtered_metadata.tsv.zst 2>/dev/null
+
+# The uncompressed outputs are identical.
+
+  $ diff <(gzcat filtered_metadata.tsv.gz) <(xzcat filtered_metadata.tsv.xz)
+
+  $ diff <(gzcat filtered_metadata.tsv.gz) <(zstdcat filtered_metadata.tsv.zst)
diff --git a/tests/functional/filter/cram/filter-output-metadata-header.t b/tests/functional/filter/cram/filter-output-metadata-header.t
index 15ff155e1..fa4b290a8 100644
--- a/tests/functional/filter/cram/filter-output-metadata-header.t
+++ b/tests/functional/filter/cram/filter-output-metadata-header.t
@@ -2,10 +2,7 @@ Setup

   $ source "$TESTDIR"/_setup.sh

-Since Pandas's read_csv() and to_csv() are used with a double-quote character as
-the default quotechar, any column names with that character may be altered.
-
-Quoted columns containing the tab delimiter are left unchanged.
+Quoting is unchanged regardless of placement.

   $ cat >metadata.tsv <<~~
   > strain	"col 1"
@@ -19,8 +16,6 @@ Quoted columns containing the tab delimiter are left unchanged.
   $ head -n 1 filtered_metadata.tsv
   strain	"col 1"

-Quoted columns without the tab delimiter are stripped of the quotes.
-
   $ cat >metadata.tsv <<~~
   > strain	"col1"
   > SEQ_1	a
@@ -31,9 +26,7 @@ Quoted columns without the tab delimiter are stripped of the quotes.
   >   --output-metadata filtered_metadata.tsv 2>/dev/null

   $ head -n 1 filtered_metadata.tsv
-  strain	col1
-
-Any other columns with quotes are quoted, and pre-existing quotes are escsaped by doubling up.
+  strain	"col1"

   $ cat >metadata.tsv <<~~
   > strain	col"1	col2"
@@ -45,4 +38,4 @@ Any other columns with quotes are quoted, and pre-existing quotes are escsaped b
   >   --output-metadata filtered_metadata.tsv 2>/dev/null

   $ head -n 1 filtered_metadata.tsv
-  strain	"col""1"	"col2"""
+  strain	col"1	col2"
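
For review context, this is roughly the shell pipeline that the new write_output_metadata() assembles, written out with hypothetical values (metadata.tsv as the input, "strain" as the detected ID column, /tmp/strains.txt as the temporary strains file, and filtered.tsv.gz as the output); at runtime each value is quoted with shlex.quote and the tsv-join path is resolved with shutil.which:

    augur read-file metadata.tsv |
      tsv-join -H --filter-file <(printf "%s\n" strain; cat /tmp/strains.txt) --key-fields strain |
      augur write-file filtered.tsv.gz

The process substitution prepends the ID column name to the strains list so that, with -H, tsv-join can treat it as a one-column TSV whose header matches --key-fields, while `augur read-file` and `augur write-file` handle any compression implied by the file extensions, which is what the new filter-output-metadata-compressed.t test exercises.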