From a79437db6e031a38581f28e266e8006d543193f5 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Wed, 5 Nov 2025 22:12:55 +0300
Subject: [PATCH 1/6] Minor typos and spelling fixes

---
 hAMRonization/ResFinderIO.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py
index b06d8ec..69e9d87 100644
--- a/hAMRonization/ResFinderIO.py
+++ b/hAMRonization/ResFinderIO.py
@@ -26,7 +26,7 @@ class ResFinderIterator(hAMRonizedResultIterator):
     def __init__(self, source, metadata):
 
         # We don't use metadata or field mapping so can just defer to super,
-        # which will open source and invoke our parse method with the open stream.
+        # which opens source and invokes our parse() on the open stream.
         super().__init__(source, dict(), metadata)
 
     def parse(self, handle):
@@ -42,7 +42,7 @@ def parse(self, handle):
         # Input data read in ResFinder 4.2+ JSON format.  This has three main elements:
         # - seq_regions: loci/genes that were found, keying into 0 or more phenotypes
         # - seq_variations: mutations that key into a seq_region and 0 or more phenotypes
-        # - phenotypes: antimicriobals keying back into the above objects
+        # - phenotypes: antimicrobials keying back into the above objects
         data = json.load(handle)
 
         # Helpers to fetch database names and versions from the JSON data
@@ -96,7 +96,7 @@ def set_variation_fields(r, vs):
             # Bags to collect variations, phenotypes and notes across the variations
             _aa_vars = list()
             _nt_vars = list()
-            _codon = list()
+            _codons = list()
             _phenos = set()
             _notes = set()
             _pmids = set()
@@ -117,7 +117,7 @@ def set_variation_fields(r, vs):
 
                 _cod_chg = v.get('codon_change')
                 if _cod_chg:
-                    _codon.append(v.get('codon_change'))
+                    _codons.append(_cod_chg)
 
                 # Add the content of the list fields to the bags above
                 _phenos.update(v.get('phenotypes', []))
@@ -129,7 +129,7 @@ def set_variation_fields(r, vs):
             res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids)
             res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars)))
             res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars)))
-            res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codon)) if _codon else None
+            res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codons)) if _codons else None
 
         # --- Do the actual work --- #
 
@@ -157,7 +157,7 @@ def set_variation_fields(r, vs):
                 res.genetic_variation_type = GENE_PRESENCE
                 set_shared_fields(r)
 
-                # Yield a new hAMRonizedResult ours using super's method as that may do the needful
+                # Yield a new hAMRonizedResult using super's method as that may do the needful
                 yield self.hAMRonize(None, res.__dict__)
 
             # Collect the list of seq_variations (if any) referenced from phenotype p,
@@ -173,7 +173,7 @@ def set_variation_fields(r, vs):
                 set_shared_fields(r)
                 set_variation_fields(r, vs)
 
-                # Yield a new hAMRonizedResult ours using super's method as that may do the needful
+                # Yield a new hAMRonizedResult using super's method as that may do the needful
                 yield self.hAMRonize(None, res.__dict__)
 
 

From 42a85d50c9dce9316ed1e13f1d9c2d71fcaeedc4 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Wed, 5 Nov 2025 23:37:14 +0300
Subject: [PATCH 2/6] Reduce the Docker container size by 1G

---
 .dockerignore |  7 +++++++
 Dockerfile    | 12 ++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)
 create mode 100644 .dockerignore

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..6b8764e
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,7 @@
+.*
+*.egg-info
+*.pyc
+/docs
+/test
+/schema
+__pycache__
diff --git a/Dockerfile b/Dockerfile
index f5903fa..a052ad8 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 # base image
-FROM python:3.9
+FROM python:3.9-alpine
 
 # metadata
 LABEL base.image="pathon:3.9"
@@ -18,12 +18,12 @@ MAINTAINER Finlay Maguire <finlaymaguire@gmail.com>
 # set the working directory in the container
 WORKDIR /hAMRonization
 
-# copy the dependencies file to the working directory
-COPY . /hAMRonization
+# copy the sources into the container
+COPY . /hAMRonization/src
 
-# install dependencies
-RUN python -m pip install hAMRonization
+# install dependencies and clean all up
+RUN python -m pip --no-cache install ./src && rm -rf ./src
 
 # command to run on container start
-ENTRYPOINT ["hamronize"] 
+ENTRYPOINT ["hamronize"]
 CMD ["--help"]

From 2fa7b124afc698b78644cd97ed6b8dd7a4b68030 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Thu, 6 Nov 2025 09:17:37 +0300
Subject: [PATCH 3/6] Fix typo in Docker image LABEL

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index a052ad8..12a3aa3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,7 +2,7 @@
 FROM python:3.9-alpine
 
 # metadata
-LABEL base.image="pathon:3.9"
+LABEL base.image="python:3.9"
 LABEL software="hAMRonization"
 ARG SOFTWARE_VERSION=unspecified
 LABEL software_version=$SOFTWARE_VERSION

From ef2593c79d650226efb40feb6a0445a0a84e6350 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Thu, 6 Nov 2025 10:52:56 +0300
Subject: [PATCH 4/6] Make container compatible with Nextflow and Singularity

 - Add bash to container as Nextflow needs this to do its own business
 - Replace forced ENTRYPOINT with soft CMD for Singularity and Nextflow
   + Note: running the container now needs 'hamronize' as first argument,
     this was previously the implicit (and enforced) ENTRYPOINT
 - Document in README the docker run ... invocation line
 - Remove the (unneeded) licensed Conda 'defaults' channel in README
---
 Dockerfile | 10 ++++++----
 README.md  |  9 +++++----
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 12a3aa3..70991e9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -15,6 +15,9 @@ LABEL tags="Genomics"
 # maintainer
 MAINTAINER Finlay Maguire <finlaymaguire@gmail.com>
 
+# add bash so Nextflow can run the container
+RUN apk add --no-cache bash && rm -rf /var/cache/apk/*
+
 # set the working directory in the container
 WORKDIR /hAMRonization
 
@@ -22,8 +25,7 @@ WORKDIR /hAMRonization
 COPY . /hAMRonization/src
 
 # install dependencies and clean all up
-RUN python -m pip --no-cache install ./src && rm -rf ./src
+RUN python -m pip --no-cache-dir install ./src && rm -rf ./src
 
-# command to run on container start
-ENTRYPOINT ["hamronize"]
-CMD ["--help"]
+# command to run on container start without args
+CMD ["hamronize", "--help"]
diff --git a/README.md b/README.md
index c7ddd92..3a9971a 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ This supports a variety of summary options including an [interactive summary](ht
 
 ## Installation
 
-This tool requires python>=3.7 and [pandas](https://pandas.pydata.org/)
+This tool requires python>=3.9 and [pandas](https://pandas.pydata.org/)
 and the latest release can be installed directly from pip, conda, docker, this repository, or from the galaxy toolshed:
 ```
 pip install hAMRonization
@@ -30,16 +30,17 @@ pip install hAMRonization
 Or
 
 ```
-conda create --name hamronization --channel conda-forge --channel bioconda --channel defaults hamronization
+conda create --name hamronization --channel conda-forge --channel bioconda hamronization
 ```
 ![version-on-conda](https://anaconda.org/bioconda/hamronization/badges/version.svg)
 ![conda-download](https://anaconda.org/bioconda/hamronization/badges/downloads.svg)
 ![last-update-on-conda](https://anaconda.org/bioconda/hamronization/badges/latest_release_date.svg)
 
 
-Or to install using docker:
+Or to install and run using docker, podman, or singularity:
 ```
-docker pull finlaymaguire/hamronization:latest
+docker pull docker.io/finlaymaguire/hamronization:latest
+docker run --rm docker.io/finlaymaguire/hamronization:latest hamronize --help
 ```
 
 Or to install the latest development version:

From 1c5ac5f5dfd1a0b1323f9711dae64f11feedd2d6 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Tue, 2 Dec 2025 01:31:03 +0300
Subject: [PATCH 5/6] Better align ResFinder hamronized output with AFP and RGI

 - Changed from one line per antimicrobial to one line per gene
---
 hAMRonization/ResFinderIO.py | 89 ++++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py
index 69e9d87..216c9bd 100644
--- a/hAMRonization/ResFinderIO.py
+++ b/hAMRonization/ResFinderIO.py
@@ -75,7 +75,7 @@ def set_shared_fields(r):
             res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos'))
             res.input_gene_stop = _get_end_pos(r.get('query_start_pos'), r.get('query_end_pos'))
             res.strand_orientation = _get_strand(r.get('query_start_pos'), r.get('query_end_pos'))
-            res.predicted_phenotype = _empty_to_none(", ".join(r.get('phenotypes', [])))
+            res.predicted_phenotype = 'antimicrobial resistance'  # we report only resistant phenotypes
             res.predicted_phenotype_confidence_level = _condense_notes(r.get('notes'), r.get('pmids'))
             res.reference_gene_length = r.get('ref_seq_length')
             res.reference_gene_start = r.get('ref_start_pos')
@@ -91,20 +91,19 @@ def set_shared_fields(r):
 
         # Setter for the hAMRonizedResult fields related to mutations
         def set_variation_fields(r, vs):
-            """Sets the mutation-specific fields in res, aggregating from all variations in vs on region r."""
+            """Sets the mutation-specific fields in res, aggregating from all variations vs."""
 
             # Bags to collect variations, phenotypes and notes across the variations
             _aa_vars = list()
             _nt_vars = list()
             _codons = list()
-            _phenos = set()
             _notes = set()
             _pmids = set()
 
-            # Iterate v over the variations in vs that lie on region r in order of their position
-            # (variation->regions strangely is a list, so we need to check if r.key is in it)
-            for v in sorted(filter(lambda v: r['key'] in v.get('seq_regions', []), vs),
-                            key=lambda v: v.get('ref_start_pos', 0)):
+            res.genetic_variation_type = NUCLEOTIDE_VARIANT
+
+            # Iterate v over the variations in vs in order of their position
+            for v in sorted(vs, key=lambda v: v.get('ref_start_pos', 0)):
 
                 # May need refinement to properly accommodate inserts and deletes,
                 # though it seems recent Res/PointFinder output uses HGVS coordinates.
@@ -120,12 +119,10 @@ def set_variation_fields(r, vs):
                     _codons.append(_cod_chg)
 
                 # Add the content of the list fields to the bags above
-                _phenos.update(v.get('phenotypes', []))
                 _notes.update(v.get('notes', []))
                 _pmids.update(v.get('pmids', []))
 
             # We have collected all variations on region r, now collapse into fields on res
-            res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos)))
             res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids)
             res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars)))
             res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars)))
@@ -138,40 +135,62 @@ def set_variation_fields(r, vs):
         res.analysis_software_name = data['software_name']
         res.analysis_software_version = data['software_version']
 
-        # We flatten the ResFinder data graph as follows
-        # - iterate over all phenotypes p (generally: antimicrobials) that have amr_resistant=true
-        #   - iterate over the seq_regions r referenced by p (generally: resistance genes)
-        #     - for each r report a GENE_PRESENCE
-        #   - group the seq_variations referenced by p by the seq_region r they lie on
-        #   - iterate over the regions r
-        #     - for each r report one AMINO_ACID_VARIANT record, collapsing the seq_variations
-        for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()):
-
-            # Set the fields available on the phenotype object
-            res.drug_class = ", ".join(p.get('amr_classes', []))
-            res.antimicrobial_agent = p.get('amr_resistance', "unspecified")
-
-            # Iterate r over the regions (AMR genes) referenced by p, and yield each in turn
-            for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])):
-
+        # To obtain the AMR genes, we flatten the ResFinder data graph as follows
+        # - iterate over each region r
+        #   - iterate over phenotypes p that reference region r and are amr_resistant
+        #     - collect their amr_classes and antimicrobials
+        #   - emit a GENE_PRESENCE record if any AMR was found
+        for r in data['seq_regions'].values():
+            amr_cls = set()
+            amr_res = set()
+
+            # Iterate p over the phenotypes that reference r and have amr_resistant set true
+            # and collect their AMR classes and antimicrobials
+            for p in filter(lambda p: r['key'] in p.get('seq_regions', [])
+                            and p.get('amr_resistant', False), data['phenotypes'].values()):
+                amr_cls.update(p.get('amr_classes', []))
+                amr_res.add(p.get('amr_resistance', "unspecified"))
+
+            # If we collected any AMR we emit the region as a GENE_PRESENCE record
+            if amr_cls or amr_res:
+
+                # Set the fields collected from the phenotypes and from the region object
                 res.genetic_variation_type = GENE_PRESENCE
+                res.drug_class = ", ".join(amr_cls)
+                res.antimicrobial_agent = ", ".join(amr_res)
                 set_shared_fields(r)
 
                 # Yield a new hAMRonizedResult using super's method as that may do the needful
                 yield self.hAMRonize(None, res.__dict__)
 
-            # Collect the list of seq_variations (if any) referenced from phenotype p,
-            # and the set of regions that these mutations lie on, so that we iterate
-            # these regions and "collapse" all mutations for that region onto one record
-            vs = list(map(lambda k: data['seq_variations'][k], p.get('seq_variations', [])))
-            rs = set(fold(lambda a, v: a + v.get('seq_regions', []), [], vs))
-
-            # Iterate r over each region referenced by some set of variations, and yield each
-            for r in map(lambda k: data['seq_regions'][k], rs):
-
+        # For the variants things are slightly more involved, as phenotypes don't reference
+        # seq_regions directly, but through seq_variations.  We have some indirection here.
+
+        for r in data['seq_regions'].values():
+            amr_cls = set()
+            amr_res = set()
+            vs_dict = dict()
+
+            # We want to collect all variations vs that reference region r AND are referenced
+            # by a phenotype p that is amr_resistant.  Along the way we collect from the p
+            # the AMR classes and antimicrobials (to save us another iteration)
+            for v in filter(lambda v: r['key'] in v.get('seq_regions', []), data['seq_variations'].values()):
+                for p in filter(lambda p: v['key'] in p.get('seq_variations', [])
+                                and (p.get('amr_classes') or p.get('amr_resistance'))
+                                and p.get('amr_resistant', False), data['phenotypes'].values()):
+                    amr_cls.update(p.get('amr_classes', []))
+                    amr_res.add(p.get('amr_resistance', "unspecified"))
+                    vs_dict[v['key']] = v # need to do this in inner loop but dups will squish
+
+            # If we collected variants with resistant phenotypes then emit a record
+            if vs_dict:
+
+                # Set fields we collected plus the region and variant ones as above
                 res.genetic_variation_type = NUCLEOTIDE_VARIANT  # default may be overridden
+                res.drug_class = ", ".join(amr_cls)
+                res.antimicrobial_agent = ", ".join(amr_res)
                 set_shared_fields(r)
-                set_variation_fields(r, vs)
+                set_variation_fields(r, vs_dict.values())
 
                 # Yield a new hAMRonizedResult using super's method as that may do the needful
                 yield self.hAMRonize(None, res.__dict__)

From 7c0919943adcd8d2bd97cb4baef0b4e8dcde2c63 Mon Sep 17 00:00:00 2001
From: Marco van Zwetselaar <io@zwets.it>
Date: Tue, 2 Dec 2025 01:56:03 +0300
Subject: [PATCH 6/6] Fix tests for ResFinderIO change

---
 hAMRonization/ResFinderIO.py  | 10 +++++-----
 test/test_parsing_validity.py | 13 +++++--------
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/hAMRonization/ResFinderIO.py b/hAMRonization/ResFinderIO.py
index 216c9bd..8f5137a 100644
--- a/hAMRonization/ResFinderIO.py
+++ b/hAMRonization/ResFinderIO.py
@@ -156,8 +156,8 @@ def set_variation_fields(r, vs):
 
                 # Set the fields collected from the phenotypes and from the region object
                 res.genetic_variation_type = GENE_PRESENCE
-                res.drug_class = ", ".join(amr_cls)
-                res.antimicrobial_agent = ", ".join(amr_res)
+                res.drug_class = ", ".join(sorted(amr_cls))
+                res.antimicrobial_agent = ", ".join(sorted(amr_res))
                 set_shared_fields(r)
 
                 # Yield a new hAMRonizedResult using super's method as that may do the needful
@@ -187,8 +187,8 @@ def set_variation_fields(r, vs):
 
                 # Set fields we collected plus the region and variant ones as above
                 res.genetic_variation_type = NUCLEOTIDE_VARIANT  # default may be overridden
-                res.drug_class = ", ".join(amr_cls)
-                res.antimicrobial_agent = ", ".join(amr_res)
+                res.drug_class = ", ".join(sorted(amr_cls))
+                res.antimicrobial_agent = ", ".join(sorted(amr_res))
                 set_shared_fields(r)
                 set_variation_fields(r, vs_dict.values())
 
@@ -228,7 +228,7 @@ def _condense_notes(notes, pmids):
     lines += filter(None, notes)
     pmids = list(filter(None, pmids))
     if pmids:
-        lines.append("PMIDs: " + ", ".join(set(pmids)))
+        lines.append("PMIDs: " + ", ".join(sorted(set(pmids))))
     return ". ".join(lines) if lines else None
 
 
diff --git a/test/test_parsing_validity.py b/test/test_parsing_validity.py
index a79b073..b9a5eef 100644
--- a/test/test_parsing_validity.py
+++ b/test/test_parsing_validity.py
@@ -315,11 +315,8 @@ def test_resfinder():
             seen_genes += 1
 
             # it reports these 4 agents separately (even if all on one gene)
-            assert (result.antimicrobial_agent, result.drug_class) in [
-                ('ciprofloxacin', 'quinolone'),
-                ('nalidixic acid', 'quinolone'),
-                ('trimethoprim', 'folate pathway antagonist'),
-                ('chloramphenicol', 'amphenicol')]
+            assert result.antimicrobial_agent == 'chloramphenicol, ciprofloxacin, nalidixic acid, trimethoprim'
+            assert result.drug_class == 'amphenicol, folate pathway antagonist, quinolone'
 
             # assert mandatory fields (5)
             assert result.gene_symbol == "OqxA"
@@ -329,7 +326,7 @@ def test_resfinder():
             assert result.reference_accession == "EU370913"
 
             # optional fields (12)
-            assert result.predicted_phenotype == "ciprofloxacin, nalidixic acid, trimethoprim, chloramphenicol"
+            assert result.predicted_phenotype == "antimicrobial resistance"
             assert result.predicted_phenotype_confidence_level == (
                 "Must be in an operon with oqxB," +
                 "phenotype differs based on genomic location of the operon PMID 25801572," +
@@ -374,7 +371,7 @@ def test_resfinder():
             # optional fields (14)
             assert result.antimicrobial_agent == "ampicillin"
             assert result.drug_class == "beta-lactam"
-            assert result.predicted_phenotype == "ampicillin"
+            assert result.predicted_phenotype == "antimicrobial resistance"
             assert result.predicted_phenotype_confidence_level == (
                 "The nineteen pbp5 mutations must be present simultaneously " +
                 "for resistance phenotype. PMIDs: 25182648")
@@ -414,7 +411,7 @@ def test_resfinder():
             assert result.genetic_variation_type is False  # just to stop
 
     # Check that we saw all
-    assert seen_genes == 4
+    assert seen_genes == 1
     assert seen_variants == 1