From adfff1e99f28d8c38977f03d1418a53ea5e11b9b Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 17 Jul 2024 15:17:00 -0400 Subject: [PATCH 1/3] Add tests for compressed metadata outputs --- .../cram/filter-output-metadata-compressed.t | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 tests/functional/filter/cram/filter-output-metadata-compressed.t diff --git a/tests/functional/filter/cram/filter-output-metadata-compressed.t b/tests/functional/filter/cram/filter-output-metadata-compressed.t new file mode 100644 index 000000000..4a2b22906 --- /dev/null +++ b/tests/functional/filter/cram/filter-output-metadata-compressed.t @@ -0,0 +1,29 @@ +Setup + + $ source "$TESTDIR"/_setup.sh + +Use the same options with 3 different compression methods. + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --subsample-max-sequences 5 \ + > --subsample-seed 0 \ + > --output-metadata filtered_metadata.tsv.gz 2>/dev/null + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --subsample-max-sequences 5 \ + > --subsample-seed 0 \ + > --output-metadata filtered_metadata.tsv.xz 2>/dev/null + + $ ${AUGUR} filter \ + > --metadata "$TESTDIR/../data/metadata.tsv" \ + > --subsample-max-sequences 5 \ + > --subsample-seed 0 \ + > --output-metadata filtered_metadata.tsv.zst 2>/dev/null + +# The uncompressed outputs are identical. + + $ diff <(gzcat filtered_metadata.tsv.gz) <(xzcat filtered_metadata.tsv.xz) + + $ diff <(gzcat filtered_metadata.tsv.gz) <(zstdcat filtered_metadata.tsv.zst) From b65e7faa6328a2e66e569b3fec0f6ffe180a5a7b Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Sat, 18 May 2024 14:09:39 -0700 Subject: [PATCH 2/3] =?UTF-8?q?=F0=9F=9A=A7=20Use=20tsv-utils=20for=20--ou?= =?UTF-8?q?tput-metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tsv-join is much faster than the other implementation here (18x faster - 12s vs. 3m43s on the current SARS-CoV-2 GISAID dataset containing 16 million rows). --- augur/filter/_run.py | 9 ++-- augur/filter/io.py | 48 +++++++++++-------- .../cram/filter-output-metadata-header.t | 2 + 3 files changed, 33 insertions(+), 26 deletions(-) diff --git a/augur/filter/_run.py b/augur/filter/_run.py index cf35f0b14..6d5d2b51e 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -419,14 +419,13 @@ def run(args): write_vcf(args.sequences, args.output_sequences, dropped_samps) else: subset_fasta(args.sequences, args.output_sequences, strains_file, args.nthreads) - if not args.output_strains: - os.remove(strains_file) if args.output_metadata: print_debug(f"Reading metadata from {args.metadata!r} and writing to {args.output_metadata!r}…") - write_output_metadata(args.metadata, args.metadata_delimiters, - args.metadata_id_columns, args.output_metadata, - valid_strains) + write_output_metadata(args.metadata, metadata_object.id_column, args.output_metadata, strains_file) + + if not args.output_strains: + os.remove(strains_file) # Calculate the number of strains that don't exist in either metadata or # sequences. diff --git a/augur/filter/io.py b/augur/filter/io.py index e8061efb9..4b8e4eb15 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -1,18 +1,20 @@ import argparse -import csv from argparse import Namespace import os import re +from shlex import quote as shquote +from shutil import which from textwrap import dedent -from typing import Sequence, Set +from typing import Sequence import numpy as np from collections import defaultdict -from xopen import xopen from augur.errors import AugurError from augur.io.file import open_file -from augur.io.metadata import Metadata, METADATA_DATE_COLUMN +from augur.io.metadata import METADATA_DATE_COLUMN from augur.io.print import print_err +from augur.io.shell_command_runner import run_shell_command +from augur.utils import augur from .constants import GROUP_BY_GENERATED_COLUMNS from .include_exclude_rules import extract_variables, parse_filter_query @@ -96,25 +98,29 @@ def constant_factory(value): raise AugurError(f"missing or malformed priority scores file {fname}") -def write_output_metadata(input_metadata_path: str, delimiters: Sequence[str], - id_columns: Sequence[str], output_metadata_path: str, - ids_to_write: Set[str]): +def write_output_metadata(input_filename: str, id_column: str, output_filename: str, ids_file: str): """ - Write output metadata file given input metadata information and a set of IDs - to write. + Write output metadata file given input metadata information and a file + containing ids to write. """ - input_metadata = Metadata(input_metadata_path, delimiters, id_columns) - - with xopen(output_metadata_path, "w", newline="") as output_metadata_handle: - output_metadata = csv.DictWriter(output_metadata_handle, fieldnames=input_metadata.columns, - delimiter="\t", lineterminator=os.linesep) - output_metadata.writeheader() - - # Write outputs based on rows in the original metadata. - for row in input_metadata.rows(): - row_id = row[input_metadata.id_column] - if row_id in ids_to_write: - output_metadata.writerow(row) + # FIXME: make this a function like augur() and seqkit() + tsv_join = which("tsv-join") + + command = f""" + {augur()} read-file {shquote(input_filename)} | + {tsv_join} -H --filter-file {ids_file} --key-fields {id_column} | + {augur()} write-file {shquote(output_filename)} + """ + + try: + run_shell_command(command, raise_errors=True) + except Exception: + if os.path.isfile(output_filename): + # Remove the partial output file. + os.remove(output_filename) + raise AugurError(f"Metadata output failed, see error(s) above.") + else: + raise AugurError(f"Metadata output failed, see error(s) above. The command may have already written data to stdout. You may want to clean up any partial outputs.") # These are the types accepted in the following function. diff --git a/tests/functional/filter/cram/filter-output-metadata-header.t b/tests/functional/filter/cram/filter-output-metadata-header.t index 15ff155e1..abf385285 100644 --- a/tests/functional/filter/cram/filter-output-metadata-header.t +++ b/tests/functional/filter/cram/filter-output-metadata-header.t @@ -7,6 +7,8 @@ the default quotechar, any column names with that character may be altered. Quoted columns containing the tab delimiter are left unchanged. +# FIXME: tsv-join has different behavior here. Test both? + $ cat >metadata.tsv <<~~ > strain "col 1" > SEQ_1 a From 0f5911e34cdff9ee308aef4fe3c45911dbafe91b Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Tue, 8 Jul 2025 20:07:48 -0700 Subject: [PATCH 3/3] =?UTF-8?q?fixup!=20=F0=9F=9A=A7=20Use=20tsv-utils=20f?= =?UTF-8?q?or=20--output-metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- augur/filter/_run.py | 2 +- augur/filter/io.py | 2 +- .../filter/cram/filter-output-metadata-header.t | 15 +++------------ 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/augur/filter/_run.py b/augur/filter/_run.py index 6d5d2b51e..48b04d151 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -402,7 +402,7 @@ def run(args): strains_file = None if args.output_strains: strains_file = args.output_strains - elif args.output_sequences: + elif args.output_sequences or args.output_metadata: strains_file = NamedTemporaryFile(delete=False).name if strains_file is not None: diff --git a/augur/filter/io.py b/augur/filter/io.py index 4b8e4eb15..49b1ed9af 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -108,7 +108,7 @@ def write_output_metadata(input_filename: str, id_column: str, output_filename: command = f""" {augur()} read-file {shquote(input_filename)} | - {tsv_join} -H --filter-file {ids_file} --key-fields {id_column} | + {tsv_join} -H --filter-file <(printf "%s\n" {shquote(id_column)}; cat {shquote(ids_file)}) --key-fields {shquote(id_column)} | {augur()} write-file {shquote(output_filename)} """ diff --git a/tests/functional/filter/cram/filter-output-metadata-header.t b/tests/functional/filter/cram/filter-output-metadata-header.t index abf385285..fa4b290a8 100644 --- a/tests/functional/filter/cram/filter-output-metadata-header.t +++ b/tests/functional/filter/cram/filter-output-metadata-header.t @@ -2,12 +2,7 @@ Setup $ source "$TESTDIR"/_setup.sh -Since Pandas's read_csv() and to_csv() are used with a double-quote character as -the default quotechar, any column names with that character may be altered. - -Quoted columns containing the tab delimiter are left unchanged. - -# FIXME: tsv-join has different behavior here. Test both? +Quoting is unchanged regardless of placement. $ cat >metadata.tsv <<~~ > strain "col 1" @@ -21,8 +16,6 @@ Quoted columns containing the tab delimiter are left unchanged. $ head -n 1 filtered_metadata.tsv strain "col 1" -Quoted columns without the tab delimiter are stripped of the quotes. - $ cat >metadata.tsv <<~~ > strain "col1" > SEQ_1 a @@ -33,9 +26,7 @@ Quoted columns without the tab delimiter are stripped of the quotes. > --output-metadata filtered_metadata.tsv 2>/dev/null $ head -n 1 filtered_metadata.tsv - strain col1 - -Any other columns with quotes are quoted, and pre-existing quotes are escsaped by doubling up. + strain "col1" $ cat >metadata.tsv <<~~ > strain col"1 col2" @@ -47,4 +38,4 @@ Any other columns with quotes are quoted, and pre-existing quotes are escsaped b > --output-metadata filtered_metadata.tsv 2>/dev/null $ head -n 1 filtered_metadata.tsv - strain "col""1" "col2""" + strain col"1 col2"