pha4ge · zwets · Nov 5, 2025 · Nov 5, 2025 · Nov 6, 2025 · Nov 6, 2025
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,7 @@
+.*
+*.egg-info
+*.pyc
+/docs
+/test
+/schema
+__pycache__
diff --git a/Dockerfile b/Dockerfile
@@ -1,8 +1,8 @@
 # base image
-FROM python:3.9
+FROM python:3.9-alpine
 
 # metadata
-LABEL base.image="pathon:3.9"
+LABEL base.image="python:3.9"
 LABEL software="hAMRonization"
 ARG SOFTWARE_VERSION=unspecified
 LABEL software_version=$SOFTWARE_VERSION
@@ -15,15 +15,17 @@ LABEL tags="Genomics"
 # maintainer
 MAINTAINER Finlay Maguire <finlaymaguire@gmail.com>
 
+# add bash so Nextflow can run the container
+RUN apk add --no-cache bash && rm -rf /var/cache/apk/*
+
 # set the working directory in the container
 WORKDIR /hAMRonization
 
-# copy the dependencies file to the working directory
-COPY . /hAMRonization
+# copy the sources into the container
+COPY . /hAMRonization/src
 
-# install dependencies
-RUN python -m pip install hAMRonization
+# install dependencies and clean all up
+RUN python -m pip --no-cache-dir install ./src && rm -rf ./src
 
-# command to run on container start
-ENTRYPOINT ["hamronize"] 
-CMD ["--help"]
+# command to run on container start without args
+CMD ["hamronize", "--help"]
diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ This supports a variety of summary options including an [interactive summary](ht
 
 ## Installation
 
-This tool requires python>=3.7 and [pandas](https://pandas.pydata.org/)
+This tool requires python>=3.9 and [pandas](https://pandas.pydata.org/)
 and the latest release can be installed directly from pip, conda, docker, this repository, or from the galaxy toolshed:
 ```
 pip install hAMRonization
@@ -30,16 +30,17 @@ pip install hAMRonization
 Or
 
 ```
-conda create --name hamronization --channel conda-forge --channel bioconda --channel defaults hamronization
+conda create --name hamronization --channel conda-forge --channel bioconda hamronization
 ```
 ![version-on-conda](https://anaconda.org/bioconda/hamronization/badges/version.svg)
 ![conda-download](https://anaconda.org/bioconda/hamronization/badges/downloads.svg)
 ![last-update-on-conda](https://anaconda.org/bioconda/hamronization/badges/latest_release_date.svg)
 
 
-Or to install using docker:
+Or to install and run using docker, podman, singularity:
 ```
-docker pull finlaymaguire/hamronization:latest
+docker pull docker.io/finlaymaguire/hamronization:latest
+docker run --rm docker.io/finlaymaguire/hamronization:latest hamronize --help
 ```
 
 Or to install the latest development version:

diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py
@@ -26,7 +26,7 @@ class ResFinderIterator(hAMRonizedResultIterator):
     def __init__(self, source, metadata):
 
         # We don't use metadata or field mapping so can just defer to super,
-        # which will open source and invoke our parse method with the open stream.
+        # which opens source and invokes our parse() on the open stream.
         super().__init__(source, dict(), metadata)
 
     def parse(self, handle):
@@ -42,7 +42,7 @@ def parse(self, handle):
         # Input data read in ResFinder 4.2+ JSON format.  This has three main elements:
         # - seq_regions: loci/genes that were found, keying into 0 or more phenotypes
         # - seq_variations: mutations that key into a seq_region and 0 or more phenotypes
-        # - phenotypes: antimicriobals keying back into the above objects
+        # - phenotypes: antimicrobials keying back into the above objects
         data = json.load(handle)
 
         # Helpers to fetch database names and versions from the JSON data
@@ -75,7 +75,7 @@ def set_shared_fields(r):
             res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos'))
             res.input_gene_stop = _get_end_pos(r.get('query_start_pos'), r.get('query_end_pos'))
             res.strand_orientation = _get_strand(r.get('query_start_pos'), r.get('query_end_pos'))
-            res.predicted_phenotype = _empty_to_none(", ".join(r.get('phenotypes', [])))
+            res.predicted_phenotype = 'antimicrobial resistance'  # we report only resistant phenotypes
             res.predicted_phenotype_confidence_level = _condense_notes(r.get('notes'), r.get('pmids'))
             res.reference_gene_length = r.get('ref_seq_length')
             res.reference_gene_start = r.get('ref_start_pos')
@@ -91,20 +91,19 @@ def set_shared_fields(r):
 
         # Setter for the hAMRonizedResult fields related to mutations
         def set_variation_fields(r, vs):
-            """Sets the mutation-specific fields in res, aggregating from all variations in vs on region r."""
+            """Sets the mutation-specific fields in res, aggregating from all variations vs."""
 
             # Bags to collect variations, phenotypes and notes across the variations
             _aa_vars = list()
             _nt_vars = list()
-            _codon = list()
-            _phenos = set()
+            _codons = list()
             _notes = set()
             _pmids = set()
 
-            # Iterate v over the variations in vs that lie on region r in order of their position
-            # (variation->regions strangely is a list, so we need to check if r.key is in it)
-            for v in sorted(filter(lambda v: r['key'] in v.get('seq_regions', []), vs),
-                            key=lambda v: v.get('ref_start_pos', 0)):
+            res.genetic_variation_type = NUCLEOTIDE_VARIANT
+
+            # Iterate v over the variations in vs in order of their position
+            for v in sorted(vs, key=lambda v: v.get('ref_start_pos', 0)):
 
                 # May need refinement to properly accommodate inserts and deletes,
                 # though it seems recent Res/PointFinder output uses HGVS coordinates.
@@ -117,19 +116,17 @@ def set_variation_fields(r, vs):
 
                 _cod_chg = v.get('codon_change')
                 if _cod_chg:
-                    _codon.append(v.get('codon_change'))
+                    _codons.append(_cod_chg)
 
                 # Add the content of the list fields to the bags above
-                _phenos.update(v.get('phenotypes', []))
                 _notes.update(v.get('notes', []))
                 _pmids.update(v.get('pmids', []))
 
             # We have collected all variations on region r, now collapse into fields on res
-            res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos)))
             res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids)
             res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars)))
             res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars)))
-            res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codon)) if _codon else None
+            res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codons)) if _codons else None
 
         # --- Do the actual work --- #
 
@@ -138,42 +135,64 @@ def set_variation_fields(r, vs):
         res.analysis_software_name = data['software_name']
         res.analysis_software_version = data['software_version']
 
-        # We flatten the ResFinder data graph as follows
-        # - iterate over all phenotypes p (generally: antimicrobials) that have amr_resistant=true
-        #   - iterate over the seq_regions r referenced by p (generally: resistance genes)
-        #     - for each r report a GENE_PRESENCE
-        #   - group the seq_variations referenced by p by the seq_region r they lie on
-        #   - iterate over the regions r
-        #     - for each r report one AMINO_ACID_VARIANT record, collapsing the seq_variations
-        for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()):
-
-            # Set the fields available on the phenotype object
-            res.drug_class = ", ".join(p.get('amr_classes', []))
-            res.antimicrobial_agent = p.get('amr_resistance', "unspecified")
-
-            # Iterate r over the regions (AMR genes) referenced by p, and yield each in turn
-            for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])):
-
+        # To obtain the AMR genes, we flatten the ResFinder data graph as follows
+        # - iterate over each region r
+        #   - iterate over phenotypes p that reference region r and are amr_resistant
+        #     - collect their amr_classes and antimicrobials
+        #   - emit a GENE_PRESENCE record if any AMR was found
+        for r in data['seq_regions'].values():
+            amr_cls = set()
+            amr_res = set()
+
+            # Iterate p over the phenotypes that reference r and have amr_resistant set true
+            # and collect their AMR classes and antimicrobials
+            for p in filter(lambda p: r['key'] in p.get('seq_regions', [])
+                            and p.get('amr_resistant', False), data['phenotypes'].values()):
+                amr_cls.update(p.get('amr_classes', []))
+                amr_res.add(p.get('amr_resistance', "unspecified"))
+
+            # If we collected any AMR we emit the region as a GENE_PRESENCE record
+            if amr_cls or amr_res:
+
+                # Set the fields collected from the phenotypes and from the region object
                 res.genetic_variation_type = GENE_PRESENCE
+                res.drug_class = ", ".join(sorted(amr_cls))
+                res.antimicrobial_agent = ", ".join(sorted(amr_res))
                 set_shared_fields(r)
 
-                # Yield a new hAMRonizedResult ours using super's method as that may do the needful
+                # Yield a new hAMRonizedResult using super's method as that may do the needful
                 yield self.hAMRonize(None, res.__dict__)
 
-            # Collect the list of seq_variations (if any) referenced from phenotype p,
-            # and the set of regions that these mutations lie on, so that we iterate
-            # these regions and "collapse" all mutations for that region onto one record
-            vs = list(map(lambda k: data['seq_variations'][k], p.get('seq_variations', [])))
-            rs = set(fold(lambda a, v: a + v.get('seq_regions', []), [], vs))
-
-            # Iterate r over each region referenced by some set of variations, and yield each
-            for r in map(lambda k: data['seq_regions'][k], rs):
-
+        # For the variants things are slightly more involved, as phenotypes don't reference
+        # seq_regions directly, but through seq_variations.  We have some indirection here.
+
+        for r in data['seq_regions'].values():
+            amr_cls = set()
+            amr_res = set()
+            vs_dict = dict()
+
+            # We want to collect all variations vs that reference region r AND are referenced
+            # by a phenotype p that is amr_resistant.  Along the way we collect from the p
+            # the AMR classes and antimicriobials (to save us another iteration)
+            for v in filter(lambda v: r['key'] in v.get('seq_regions', []), data['seq_variations'].values()):
+                for p in filter(lambda p: v['key'] in p.get('seq_variations', [])
+                                and (p.get('amr_classes') or p.get('amr_resistance'))
+                                and p.get('amr_resistant', False), data['phenotypes'].values()):
+                    amr_cls.update(p.get('amr_classes', []))
+                    amr_res.add(p.get('amr_resistance', "unspecified"))
+                    vs_dict[v['key']] = v # need to do this in inner loop but dups will squish
+
+            # If we collected variants with resistant phenotypes then emit a record
+            if vs_dict:
+
+                # Set fields we collected plus the region and variant ones as above
                 res.genetic_variation_type = NUCLEOTIDE_VARIANT  # default may be overridden
+                res.drug_class = ", ".join(sorted(amr_cls))
+                res.antimicrobial_agent = ", ".join(sorted(amr_res))
                 set_shared_fields(r)
-                set_variation_fields(r, vs)
+                set_variation_fields(r, vs_dict.values())
 
-                # Yield a new hAMRonizedResult ours using super's method as that may do the needful
+                # Yield a new hAMRonizedResult using super's method as that may do the needful
                 yield self.hAMRonize(None, res.__dict__)
 
 
@@ -209,7 +228,7 @@ def _condense_notes(notes, pmids):
     lines += filter(None, notes)
     pmids = list(filter(None, pmids))
     if pmids:
-        lines.append("PMIDs: " + ", ".join(set(pmids)))
+        lines.append("PMIDs: " + ", ".join(sorted(set(pmids))))
     return ". ".join(lines) if lines else None
 
 

diff --git a/test/test_parsing_validity.py b/test/test_parsing_validity.py
@@ -315,11 +315,8 @@ def test_resfinder():
             seen_genes += 1
 
             # it reports these 4 agents separately (even if all on one gene)
-            assert (result.antimicrobial_agent, result.drug_class) in [
-                ('ciprofloxacin', 'quinolone'),
-                ('nalidixic acid', 'quinolone'),
-                ('trimethoprim', 'folate pathway antagonist'),
-                ('chloramphenicol', 'amphenicol')]
+            assert result.antimicrobial_agent == 'chloramphenicol, ciprofloxacin, nalidixic acid, trimethoprim'
+            assert result.drug_class == 'amphenicol, folate pathway antagonist, quinolone'
 
             # assert mandatory fields (5)
             assert result.gene_symbol == "OqxA"
@@ -329,7 +326,7 @@ def test_resfinder():
             assert result.reference_accession == "EU370913"
 
             # optional fields (12)
-            assert result.predicted_phenotype == "ciprofloxacin, nalidixic acid, trimethoprim, chloramphenicol"
+            assert result.predicted_phenotype == "antimicrobial resistance"
             assert result.predicted_phenotype_confidence_level == (
                 "Must be in an operon with oqxB," +
                 "phenotype differs based on genomic location of the operon PMID 25801572," +
@@ -374,7 +371,7 @@ def test_resfinder():
             # optional fields (14)
             assert result.antimicrobial_agent == "ampicillin"
             assert result.drug_class == "beta-lactam"
-            assert result.predicted_phenotype == "ampicillin"
+            assert result.predicted_phenotype == "antimicrobial resistance"
             assert result.predicted_phenotype_confidence_level == (
                 "The nineteen pbp5 mutations must be present simultaneously " +
                 "for resistance phenotype. PMIDs: 25182648")
@@ -414,7 +411,7 @@ def test_resfinder():
             assert result.genetic_variation_type is False  # just to stop
 
     # Check that we saw all
-    assert seen_genes == 4
+    assert seen_genes == 1
     assert seen_variants == 1