Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
.*
*.egg-info
*.pyc
/docs
/test
/schema
__pycache__
20 changes: 11 additions & 9 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# base image
FROM python:3.9
FROM python:3.9-alpine

# metadata
LABEL base.image="pathon:3.9"
LABEL base.image="python:3.9"
LABEL software="hAMRonization"
ARG SOFTWARE_VERSION=unspecified
LABEL software_version=$SOFTWARE_VERSION
Expand All @@ -15,15 +15,17 @@ LABEL tags="Genomics"
# maintainer
MAINTAINER Finlay Maguire <finlaymaguire@gmail.com>

# add bash so Nextflow can run the container
RUN apk add --no-cache bash && rm -rf /var/cache/apk/*

# set the working directory in the container
WORKDIR /hAMRonization

# copy the dependencies file to the working directory
COPY . /hAMRonization
# copy the sources into the container
COPY . /hAMRonization/src

# install dependencies
RUN python -m pip install hAMRonization
# install dependencies and clean all up
RUN python -m pip --no-cache-dir install ./src && rm -rf ./src

# command to run on container start
ENTRYPOINT ["hamronize"]
CMD ["--help"]
# command to run on container start without args
CMD ["hamronize", "--help"]
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ This supports a variety of summary options including an [interactive summary](ht

## Installation

This tool requires python>=3.7 and [pandas](https://pandas.pydata.org/)
This tool requires python>=3.9 and [pandas](https://pandas.pydata.org/)
and the latest release can be installed directly from pip, conda, docker, this repository, or from the galaxy toolshed:
```
pip install hAMRonization
Expand All @@ -30,16 +30,17 @@ pip install hAMRonization
Or

```
conda create --name hamronization --channel conda-forge --channel bioconda --channel defaults hamronization
conda create --name hamronization --channel conda-forge --channel bioconda hamronization
```
![version-on-conda](https://anaconda.org/bioconda/hamronization/badges/version.svg)
![conda-download](https://anaconda.org/bioconda/hamronization/badges/downloads.svg)
![last-update-on-conda](https://anaconda.org/bioconda/hamronization/badges/latest_release_date.svg)


Or to install using docker:
Or to install and run using docker, podman, singularity:
```
docker pull finlaymaguire/hamronization:latest
docker pull docker.io/finlaymaguire/hamronization:latest
docker run --rm docker.io/finlaymaguire/hamronization:latest hamronize --help
```

Or to install the latest development version:
Expand Down
105 changes: 62 additions & 43 deletions hAMRonization/ResFinderIO.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ResFinderIterator(hAMRonizedResultIterator):
def __init__(self, source, metadata):

# We don't use metadata or field mapping so can just defer to super,
# which will open source and invoke our parse method with the open stream.
# which opens source and invokes our parse() on the open stream.
super().__init__(source, dict(), metadata)

def parse(self, handle):
Expand All @@ -42,7 +42,7 @@ def parse(self, handle):
# Input data read in ResFinder 4.2+ JSON format. This has three main elements:
# - seq_regions: loci/genes that were found, keying into 0 or more phenotypes
# - seq_variations: mutations that key into a seq_region and 0 or more phenotypes
# - phenotypes: antimicriobals keying back into the above objects
# - phenotypes: antimicrobials keying back into the above objects
data = json.load(handle)

# Helpers to fetch database names and versions from the JSON data
Expand Down Expand Up @@ -75,7 +75,7 @@ def set_shared_fields(r):
res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos'))
res.input_gene_stop = _get_end_pos(r.get('query_start_pos'), r.get('query_end_pos'))
res.strand_orientation = _get_strand(r.get('query_start_pos'), r.get('query_end_pos'))
res.predicted_phenotype = _empty_to_none(", ".join(r.get('phenotypes', [])))
res.predicted_phenotype = 'antimicrobial resistance' # we report only resistant phenotypes
res.predicted_phenotype_confidence_level = _condense_notes(r.get('notes'), r.get('pmids'))
res.reference_gene_length = r.get('ref_seq_length')
res.reference_gene_start = r.get('ref_start_pos')
Expand All @@ -91,20 +91,19 @@ def set_shared_fields(r):

# Setter for the hAMRonizedResult fields related to mutations
def set_variation_fields(r, vs):
"""Sets the mutation-specific fields in res, aggregating from all variations in vs on region r."""
"""Sets the mutation-specific fields in res, aggregating from all variations vs."""

# Bags to collect variations, phenotypes and notes across the variations
_aa_vars = list()
_nt_vars = list()
_codon = list()
_phenos = set()
_codons = list()
_notes = set()
_pmids = set()

# Iterate v over the variations in vs that lie on region r in order of their position
# (variation->regions strangely is a list, so we need to check if r.key is in it)
for v in sorted(filter(lambda v: r['key'] in v.get('seq_regions', []), vs),
key=lambda v: v.get('ref_start_pos', 0)):
res.genetic_variation_type = NUCLEOTIDE_VARIANT

# Iterate v over the variations in vs in order of their position
for v in sorted(vs, key=lambda v: v.get('ref_start_pos', 0)):

# May need refinement to properly accommodate inserts and deletes,
# though it seems recent Res/PointFinder output uses HGVS coordinates.
Expand All @@ -117,19 +116,17 @@ def set_variation_fields(r, vs):

_cod_chg = v.get('codon_change')
if _cod_chg:
_codon.append(v.get('codon_change'))
_codons.append(_cod_chg)

# Add the content of the list fields to the bags above
_phenos.update(v.get('phenotypes', []))
_notes.update(v.get('notes', []))
_pmids.update(v.get('pmids', []))

# We have collected all variations on region r, now collapse into fields on res
res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos)))
res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids)
res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars)))
res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars)))
res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codon)) if _codon else None
res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codons)) if _codons else None

# --- Do the actual work --- #

Expand All @@ -138,42 +135,64 @@ def set_variation_fields(r, vs):
res.analysis_software_name = data['software_name']
res.analysis_software_version = data['software_version']

# We flatten the ResFinder data graph as follows
# - iterate over all phenotypes p (generally: antimicrobials) that have amr_resistant=true
# - iterate over the seq_regions r referenced by p (generally: resistance genes)
# - for each r report a GENE_PRESENCE
# - group the seq_variations referenced by p by the seq_region r they lie on
# - iterate over the regions r
# - for each r report one AMINO_ACID_VARIANT record, collapsing the seq_variations
for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()):

# Set the fields available on the phenotype object
res.drug_class = ", ".join(p.get('amr_classes', []))
res.antimicrobial_agent = p.get('amr_resistance', "unspecified")

# Iterate r over the regions (AMR genes) referenced by p, and yield each in turn
for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])):

# To obtain the AMR genes, we flatten the ResFinder data graph as follows
# - iterate over each region r
# - iterate over phenotypes p that reference region r and are amr_resistant
# - collect their amr_classes and antimicrobials
# - emit a GENE_PRESENCE record if any AMR was found
for r in data['seq_regions'].values():
amr_cls = set()
amr_res = set()

# Iterate p over the phenotypes that reference r and have amr_resistant set true
# and collect their AMR classes and antimicrobials
for p in filter(lambda p: r['key'] in p.get('seq_regions', [])
and p.get('amr_resistant', False), data['phenotypes'].values()):
amr_cls.update(p.get('amr_classes', []))
amr_res.add(p.get('amr_resistance', "unspecified"))

# If we collected any AMR we emit the region as a GENE_PRESENCE record
if amr_cls or amr_res:

# Set the fields collected from the phenotypes and from the region object
res.genetic_variation_type = GENE_PRESENCE
res.drug_class = ", ".join(sorted(amr_cls))
res.antimicrobial_agent = ", ".join(sorted(amr_res))
set_shared_fields(r)

# Yield a new hAMRonizedResult ours using super's method as that may do the needful
# Yield a new hAMRonizedResult using super's method as that may do the needful
yield self.hAMRonize(None, res.__dict__)

# Collect the list of seq_variations (if any) referenced from phenotype p,
# and the set of regions that these mutations lie on, so that we iterate
# these regions and "collapse" all mutations for that region onto one record
vs = list(map(lambda k: data['seq_variations'][k], p.get('seq_variations', [])))
rs = set(fold(lambda a, v: a + v.get('seq_regions', []), [], vs))

# Iterate r over each region referenced by some set of variations, and yield each
for r in map(lambda k: data['seq_regions'][k], rs):

# For the variants things are slightly more involved, as phenotypes don't reference
# seq_regions directly, but through seq_variations. We have some indirection here.

for r in data['seq_regions'].values():
amr_cls = set()
amr_res = set()
vs_dict = dict()

# We want to collect all variations vs that reference region r AND are referenced
# by a phenotype p that is amr_resistant. Along the way we collect from the p
# the AMR classes and antimicriobials (to save us another iteration)
for v in filter(lambda v: r['key'] in v.get('seq_regions', []), data['seq_variations'].values()):
for p in filter(lambda p: v['key'] in p.get('seq_variations', [])
and (p.get('amr_classes') or p.get('amr_resistance'))
and p.get('amr_resistant', False), data['phenotypes'].values()):
amr_cls.update(p.get('amr_classes', []))
amr_res.add(p.get('amr_resistance', "unspecified"))
vs_dict[v['key']] = v # need to do this in inner loop but dups will squish

# If we collected variants with resistant phenotypes then emit a record
if vs_dict:

# Set fields we collected plus the region and variant ones as above
res.genetic_variation_type = NUCLEOTIDE_VARIANT # default may be overridden
res.drug_class = ", ".join(sorted(amr_cls))
res.antimicrobial_agent = ", ".join(sorted(amr_res))
set_shared_fields(r)
set_variation_fields(r, vs)
set_variation_fields(r, vs_dict.values())

# Yield a new hAMRonizedResult ours using super's method as that may do the needful
# Yield a new hAMRonizedResult using super's method as that may do the needful
yield self.hAMRonize(None, res.__dict__)


Expand Down Expand Up @@ -209,7 +228,7 @@ def _condense_notes(notes, pmids):
lines += filter(None, notes)
pmids = list(filter(None, pmids))
if pmids:
lines.append("PMIDs: " + ", ".join(set(pmids)))
lines.append("PMIDs: " + ", ".join(sorted(set(pmids))))
return ". ".join(lines) if lines else None


Expand Down
13 changes: 5 additions & 8 deletions test/test_parsing_validity.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,11 +315,8 @@ def test_resfinder():
seen_genes += 1

# it reports these 4 agents separately (even if all on one gene)
assert (result.antimicrobial_agent, result.drug_class) in [
('ciprofloxacin', 'quinolone'),
('nalidixic acid', 'quinolone'),
('trimethoprim', 'folate pathway antagonist'),
('chloramphenicol', 'amphenicol')]
assert result.antimicrobial_agent == 'chloramphenicol, ciprofloxacin, nalidixic acid, trimethoprim'
assert result.drug_class == 'amphenicol, folate pathway antagonist, quinolone'

# assert mandatory fields (5)
assert result.gene_symbol == "OqxA"
Expand All @@ -329,7 +326,7 @@ def test_resfinder():
assert result.reference_accession == "EU370913"

# optional fields (12)
assert result.predicted_phenotype == "ciprofloxacin, nalidixic acid, trimethoprim, chloramphenicol"
assert result.predicted_phenotype == "antimicrobial resistance"
assert result.predicted_phenotype_confidence_level == (
"Must be in an operon with oqxB," +
"phenotype differs based on genomic location of the operon PMID 25801572," +
Expand Down Expand Up @@ -374,7 +371,7 @@ def test_resfinder():
# optional fields (14)
assert result.antimicrobial_agent == "ampicillin"
assert result.drug_class == "beta-lactam"
assert result.predicted_phenotype == "ampicillin"
assert result.predicted_phenotype == "antimicrobial resistance"
assert result.predicted_phenotype_confidence_level == (
"The nineteen pbp5 mutations must be present simultaneously " +
"for resistance phenotype. PMIDs: 25182648")
Expand Down Expand Up @@ -414,7 +411,7 @@ def test_resfinder():
assert result.genetic_variation_type is False # just to stop

# Check that we saw all
assert seen_genes == 4
assert seen_genes == 1
assert seen_variants == 1


Expand Down