From a79437db6e031a38581f28e266e8006d543193f5 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Wed, 5 Nov 2025 22:12:55 +0300 Subject: [PATCH 1/6] Minor typos and spelling fixes --- hAMRonization/ResFinderIO.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py index b06d8ec..69e9d87 100644 --- a/hAMRonization/ResFinderIO.py +++ b/hAMRonization/ResFinderIO.py @@ -26,7 +26,7 @@ class ResFinderIterator(hAMRonizedResultIterator): def __init__(self, source, metadata): # We don't use metadata or field mapping so can just defer to super, - # which will open source and invoke our parse method with the open stream. + # which opens source and invokes our parse() on the open stream. super().__init__(source, dict(), metadata) def parse(self, handle): @@ -42,7 +42,7 @@ def parse(self, handle): # Input data read in ResFinder 4.2+ JSON format. This has three main elements: # - seq_regions: loci/genes that were found, keying into 0 or more phenotypes # - seq_variations: mutations that key into a seq_region and 0 or more phenotypes - # - phenotypes: antimicriobals keying back into the above objects + # - phenotypes: antimicrobials keying back into the above objects data = json.load(handle) # Helpers to fetch database names and versions from the JSON data @@ -96,7 +96,7 @@ def set_variation_fields(r, vs): # Bags to collect variations, phenotypes and notes across the variations _aa_vars = list() _nt_vars = list() - _codon = list() + _codons = list() _phenos = set() _notes = set() _pmids = set() @@ -117,7 +117,7 @@ def set_variation_fields(r, vs): _cod_chg = v.get('codon_change') if _cod_chg: - _codon.append(v.get('codon_change')) + _codons.append(_cod_chg) # Add the content of the list fields to the bags above _phenos.update(v.get('phenotypes', [])) @@ -129,7 +129,7 @@ def set_variation_fields(r, vs): res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids) 
res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars))) res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars))) - res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codon)) if _codon else None + res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codons)) if _codons else None # --- Do the actual work --- # @@ -157,7 +157,7 @@ def set_variation_fields(r, vs): res.genetic_variation_type = GENE_PRESENCE set_shared_fields(r) - # Yield a new hAMRonizedResult ours using super's method as that may do the needful + # Yield a new hAMRonizedResult using super's method as that may do the needful yield self.hAMRonize(None, res.__dict__) # Collect the list of seq_variations (if any) referenced from phenotype p, @@ -173,7 +173,7 @@ def set_variation_fields(r, vs): set_shared_fields(r) set_variation_fields(r, vs) - # Yield a new hAMRonizedResult ours using super's method as that may do the needful + # Yield a new hAMRonizedResult using super's method as that may do the needful yield self.hAMRonize(None, res.__dict__) From 42a85d50c9dce9316ed1e13f1d9c2d71fcaeedc4 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Wed, 5 Nov 2025 23:37:14 +0300 Subject: [PATCH 2/6] Reduce the Docker container size by 1G --- .dockerignore | 7 +++++++ Dockerfile | 12 ++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..6b8764e --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +.* +*.egg-info +*.pyc +/docs +/test +/schema +__pycache__ diff --git a/Dockerfile b/Dockerfile index f5903fa..a052ad8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # base image -FROM python:3.9 +FROM python:3.9-alpine # metadata LABEL base.image="pathon:3.9" @@ -18,12 +18,12 @@ MAINTAINER Finlay Maguire # set the working directory in the container WORKDIR /hAMRonization -# copy the dependencies file to the 
working directory -COPY . /hAMRonization +# copy the sources into the container +COPY . /hAMRonization/src -# install dependencies -RUN python -m pip install hAMRonization +# install dependencies and clean all up +RUN python -m pip --no-cache install ./src && rm -rf ./src # command to run on container start -ENTRYPOINT ["hamronize"] +ENTRYPOINT ["hamronize"] CMD ["--help"] From 2fa7b124afc698b78644cd97ed6b8dd7a4b68030 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Thu, 6 Nov 2025 09:17:37 +0300 Subject: [PATCH 3/6] Fix typo in Docker image LABEL --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index a052ad8..12a3aa3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM python:3.9-alpine # metadata -LABEL base.image="pathon:3.9" +LABEL base.image="python:3.9" LABEL software="hAMRonization" ARG SOFTWARE_VERSION=unspecified LABEL software_version=$SOFTWARE_VERSION From ef2593c79d650226efb40feb6a0445a0a84e6350 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Thu, 6 Nov 2025 10:52:56 +0300 Subject: [PATCH 4/6] Make container compatible with Nextflow and Singularity - Add bash to container as Nextflow needs this to do its own business - Replace forced ENTRYPOINT with soft CMD for Singularity and Nextflow + Note: running the container now needs 'hamronize' as first argument, this was previously the implicit (and enforced) ENTRYPOINT - Document in README the docker run ... 
invocation line - Remove the (unneeded) licensed Conda 'default' channel in README --- Dockerfile | 10 ++++++---- README.md | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 12a3aa3..70991e9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,6 +15,9 @@ LABEL tags="Genomics" # maintainer MAINTAINER Finlay Maguire +# add bash so Nextflow can run the container +RUN apk add --no-cache bash && rm -rf /var/cache/apk/* + # set the working directory in the container WORKDIR /hAMRonization @@ -22,8 +25,7 @@ WORKDIR /hAMRonization COPY . /hAMRonization/src # install dependencies and clean all up -RUN python -m pip --no-cache install ./src && rm -rf ./src +RUN python -m pip --no-cache-dir install ./src && rm -rf ./src -# command to run on container start -ENTRYPOINT ["hamronize"] -CMD ["--help"] +# command to run on container start without args +CMD ["hamronize", "--help"] diff --git a/README.md b/README.md index c7ddd92..3a9971a 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ This supports a variety of summary options including an [interactive summary](ht ## Installation -This tool requires python>=3.7 and [pandas](https://pandas.pydata.org/) +This tool requires python>=3.9 and [pandas](https://pandas.pydata.org/) and the latest release can be installed directly from pip, conda, docker, this repository, or from the galaxy toolshed: ``` pip install hAMRonization @@ -30,16 +30,17 @@ pip install hAMRonization Or ``` -conda create --name hamronization --channel conda-forge --channel bioconda --channel defaults hamronization +conda create --name hamronization --channel conda-forge --channel bioconda hamronization ``` ![version-on-conda](https://anaconda.org/bioconda/hamronization/badges/version.svg) ![conda-download](https://anaconda.org/bioconda/hamronization/badges/downloads.svg) ![last-update-on-conda](https://anaconda.org/bioconda/hamronization/badges/latest_release_date.svg) -Or to install using docker: +Or to 
install and run using docker, podman, singularity: ``` -docker pull finlaymaguire/hamronization:latest +docker pull docker.io/finlaymaguire/hamronization:latest +docker run --rm docker.io/finlaymaguire/hamronization:latest hamronize --help ``` Or to install the latest development version: From 1c5ac5f5dfd1a0b1323f9711dae64f11feedd2d6 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Tue, 2 Dec 2025 01:31:03 +0300 Subject: [PATCH 5/6] Better align ResFinder hamronized output with AFP and RGI - Changed from one line per antimicrobial to one line per gene --- hAMRonization/ResFinderIO.py | 89 ++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py index 69e9d87..216c9bd 100644 --- a/hAMRonization/ResFinderIO.py +++ b/hAMRonization/ResFinderIO.py @@ -75,7 +75,7 @@ def set_shared_fields(r): res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos')) res.input_gene_stop = _get_end_pos(r.get('query_start_pos'), r.get('query_end_pos')) res.strand_orientation = _get_strand(r.get('query_start_pos'), r.get('query_end_pos')) - res.predicted_phenotype = _empty_to_none(", ".join(r.get('phenotypes', []))) + res.predicted_phenotype = 'antimicrobial resistance' # we report only resistant phenotypes res.predicted_phenotype_confidence_level = _condense_notes(r.get('notes'), r.get('pmids')) res.reference_gene_length = r.get('ref_seq_length') res.reference_gene_start = r.get('ref_start_pos') @@ -91,20 +91,19 @@ def set_shared_fields(r): # Setter for the hAMRonizedResult fields related to mutations def set_variation_fields(r, vs): - """Sets the mutation-specific fields in res, aggregating from all variations in vs on region r.""" + """Sets the mutation-specific fields in res, aggregating from all variations vs.""" # Bags to collect variations, phenotypes and notes across the variations _aa_vars = list() _nt_vars = list() _codons = list() - _phenos = 
set() _notes = set() _pmids = set() - # Iterate v over the variations in vs that lie on region r in order of their position - # (variation->regions strangely is a list, so we need to check if r.key is in it) - for v in sorted(filter(lambda v: r['key'] in v.get('seq_regions', []), vs), - key=lambda v: v.get('ref_start_pos', 0)): + res.genetic_variation_type = NUCLEOTIDE_VARIANT + + # Iterate v over the variations in vs in order of their position + for v in sorted(vs, key=lambda v: v.get('ref_start_pos', 0)): # May need refinement to properly accommodate inserts and deletes, # though it seems recent Res/PointFinder output uses HGVS coordinates. @@ -120,12 +119,10 @@ def set_variation_fields(r, vs): _codons.append(_cod_chg) # Add the content of the list fields to the bags above - _phenos.update(v.get('phenotypes', [])) _notes.update(v.get('notes', [])) _pmids.update(v.get('pmids', [])) # We have collected all variations on region r, now collapse into fields on res - res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos))) res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids) res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars))) res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars))) @@ -138,40 +135,62 @@ def set_variation_fields(r, vs): res.analysis_software_name = data['software_name'] res.analysis_software_version = data['software_version'] - # We flatten the ResFinder data graph as follows - # - iterate over all phenotypes p (generally: antimicrobials) that have amr_resistant=true - # - iterate over the seq_regions r referenced by p (generally: resistance genes) - # - for each r report a GENE_PRESENCE - # - group the seq_variations referenced by p by the seq_region r they lie on - # - iterate over the regions r - # - for each r report one AMINO_ACID_VARIANT record, collapsing the seq_variations - for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()): - - 
# Set the fields available on the phenotype object - res.drug_class = ", ".join(p.get('amr_classes', [])) - res.antimicrobial_agent = p.get('amr_resistance', "unspecified") - - # Iterate r over the regions (AMR genes) referenced by p, and yield each in turn - for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])): - + # To obtain the AMR genes, we flatten the ResFinder data graph as follows + # - iterate over each region r + # - iterate over phenotypes p that reference region r and are amr_resistant + # - collect their amr_classes and antimicrobials + # - emit a GENE_PRESENCE record if any AMR was found + for r in data['seq_regions'].values(): + amr_cls = set() + amr_res = set() + + # Iterate p over the phenotypes that reference r and have amr_resistant set true + # and collect their AMR classes and antimicrobials + for p in filter(lambda p: r['key'] in p.get('seq_regions', []) + and p.get('amr_resistant', False), data['phenotypes'].values()): + amr_cls.update(p.get('amr_classes', [])) + amr_res.add(p.get('amr_resistance', "unspecified")) + + # If we collected any AMR we emit the region as a GENE_PRESENCE record + if amr_cls or amr_res: + + # Set the fields collected from the phenotypes and from the region object res.genetic_variation_type = GENE_PRESENCE + res.drug_class = ", ".join(amr_cls) + res.antimicrobial_agent = ", ".join(amr_res) set_shared_fields(r) # Yield a new hAMRonizedResult using super's method as that may do the needful yield self.hAMRonize(None, res.__dict__) - # Collect the list of seq_variations (if any) referenced from phenotype p, - # and the set of regions that these mutations lie on, so that we iterate - # these regions and "collapse" all mutations for that region onto one record - vs = list(map(lambda k: data['seq_variations'][k], p.get('seq_variations', []))) - rs = set(fold(lambda a, v: a + v.get('seq_regions', []), [], vs)) - - # Iterate r over each region referenced by some set of variations, and yield each - for r in 
map(lambda k: data['seq_regions'][k], rs): - + # For the variants things are slightly more involved, as phenotypes don't reference + # seq_regions directly, but through seq_variations. We have some indirection here. + + for r in data['seq_regions'].values(): + amr_cls = set() + amr_res = set() + vs_dict = dict() + + # We want to collect all variations vs that reference region r AND are referenced + # by a phenotype p that is amr_resistant. Along the way we collect from the p + # the AMR classes and antimicrobials (to save us another iteration) + for v in filter(lambda v: r['key'] in v.get('seq_regions', []), data['seq_variations'].values()): + for p in filter(lambda p: v['key'] in p.get('seq_variations', []) + and (p.get('amr_classes') or p.get('amr_resistance')) + and p.get('amr_resistant', False), data['phenotypes'].values()): + amr_cls.update(p.get('amr_classes', [])) + amr_res.add(p.get('amr_resistance', "unspecified")) + vs_dict[v['key']] = v # need to do this in inner loop but dups will squish + + # If we collected variants with resistant phenotypes then emit a record + if vs_dict: + + # Set fields we collected plus the region and variant ones as above res.genetic_variation_type = NUCLEOTIDE_VARIANT # default may be overridden + res.drug_class = ", ".join(amr_cls) + res.antimicrobial_agent = ", ".join(amr_res) set_shared_fields(r) - set_variation_fields(r, vs) + set_variation_fields(r, vs_dict.values()) # Yield a new hAMRonizedResult using super's method as that may do the needful yield self.hAMRonize(None, res.__dict__) From 7c0919943adcd8d2bd97cb4baef0b4e8dcde2c63 Mon Sep 17 00:00:00 2001 From: Marco van Zwetselaar Date: Tue, 2 Dec 2025 01:56:03 +0300 Subject: [PATCH 6/6] Fix tests for ResFinderIO change --- hAMRonization/ResFinderIO.py | 10 +++++----- test/test_parsing_validity.py | 13 +++++-------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py index 216c9bd..8f5137a 100644 --- 
a/hAMRonization/ResFinderIO.py +++ b/hAMRonization/ResFinderIO.py @@ -156,8 +156,8 @@ def set_variation_fields(r, vs): # Set the fields collected from the phenotypes and from the region object res.genetic_variation_type = GENE_PRESENCE - res.drug_class = ", ".join(amr_cls) - res.antimicrobial_agent = ", ".join(amr_res) + res.drug_class = ", ".join(sorted(amr_cls)) + res.antimicrobial_agent = ", ".join(sorted(amr_res)) set_shared_fields(r) # Yield a new hAMRonizedResult using super's method as that may do the needful @@ -187,8 +187,8 @@ def set_variation_fields(r, vs): # Set fields we collected plus the region and variant ones as above res.genetic_variation_type = NUCLEOTIDE_VARIANT # default may be overridden - res.drug_class = ", ".join(amr_cls) - res.antimicrobial_agent = ", ".join(amr_res) + res.drug_class = ", ".join(sorted(amr_cls)) + res.antimicrobial_agent = ", ".join(sorted(amr_res)) set_shared_fields(r) set_variation_fields(r, vs_dict.values()) @@ -228,7 +228,7 @@ def _condense_notes(notes, pmids): lines += filter(None, notes) pmids = list(filter(None, pmids)) if pmids: - lines.append("PMIDs: " + ", ".join(set(pmids))) + lines.append("PMIDs: " + ", ".join(sorted(set(pmids)))) return ". 
".join(lines) if lines else None diff --git a/test/test_parsing_validity.py b/test/test_parsing_validity.py index a79b073..b9a5eef 100644 --- a/test/test_parsing_validity.py +++ b/test/test_parsing_validity.py @@ -315,11 +315,8 @@ def test_resfinder(): seen_genes += 1 # it reports these 4 agents separately (even if all on one gene) - assert (result.antimicrobial_agent, result.drug_class) in [ - ('ciprofloxacin', 'quinolone'), - ('nalidixic acid', 'quinolone'), - ('trimethoprim', 'folate pathway antagonist'), - ('chloramphenicol', 'amphenicol')] + assert result.antimicrobial_agent == 'chloramphenicol, ciprofloxacin, nalidixic acid, trimethoprim' + assert result.drug_class == 'amphenicol, folate pathway antagonist, quinolone' # assert mandatory fields (5) assert result.gene_symbol == "OqxA" @@ -329,7 +326,7 @@ def test_resfinder(): assert result.reference_accession == "EU370913" # optional fields (12) - assert result.predicted_phenotype == "ciprofloxacin, nalidixic acid, trimethoprim, chloramphenicol" + assert result.predicted_phenotype == "antimicrobial resistance" assert result.predicted_phenotype_confidence_level == ( "Must be in an operon with oqxB," + "phenotype differs based on genomic location of the operon PMID 25801572," + @@ -374,7 +371,7 @@ def test_resfinder(): # optional fields (14) assert result.antimicrobial_agent == "ampicillin" assert result.drug_class == "beta-lactam" - assert result.predicted_phenotype == "ampicillin" + assert result.predicted_phenotype == "antimicrobial resistance" assert result.predicted_phenotype_confidence_level == ( "The nineteen pbp5 mutations must be present simultaneously " + "for resistance phenotype. PMIDs: 25182648") @@ -414,7 +411,7 @@ def test_resfinder(): assert result.genetic_variation_type is False # just to stop # Check that we saw all - assert seen_genes == 4 + assert seen_genes == 1 assert seen_variants == 1