Skip to content

Commit

Permalink
Merge pull request #94 from zwets/upgrade-resfinder
Browse files Browse the repository at this point in the history
Upgrade ResFinder parser to 4.6.0
  • Loading branch information
fmaguire authored Dec 27, 2024
2 parents 76ca2f5 + 7335521 commit 1ee9d42
Show file tree
Hide file tree
Showing 13 changed files with 324 additions and 151 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ Currently implemented parsers and the last tool version for which they have been
10. [mykrobe](test/data/raw_outputs/mykrobe/report.json): last updated for v0.8.1
11. [pointfinder](hAMRonization/PointFinderIO.py): last updated for v4.1.0
12. [resfams](hAMRonization/ResFamsIO.py): last updated for hmmer v3.3.2
13. [resfinder](hAMRonization/ResFinderIO.py): last updated for v4.1.0
13. [resfinder](hAMRonization/ResFinderIO.py): last updated for v4.6.0
14. [rgi](hAMRonization/RgiIO.py) (includes RGI-BWT) last updated for v5.2.0
15. [srax](hAMRonization/SraxIO.py): last updated for v1.5
16. [srst2](hAMRonization/Srst2IO.py): last updated for v0.2.0
Expand Down
273 changes: 203 additions & 70 deletions hAMRonization/ResFinderIO.py
Original file line number Diff line number Diff line change
@@ -1,87 +1,220 @@
#!/usr/bin/env python

import csv
import json
from .Interfaces import hAMRonizedResultIterator
from hAMRonization.constants import GENE_PRESENCE
from .hAMRonizedResult import hAMRonizedResult
from hAMRonization.constants import (GENE_PRESENCE, NUCLEOTIDE_VARIANT, AMINO_ACID_VARIANT)

# No required metadata, everything is in the JSON
required_metadata = [
"analysis_software_version",
"reference_database_version",
"input_file_name",
]


# This class is the parser instantiated by the hAMRonize framework to parse
# ResFinder output. Its parse() method must return a hAMRonizedResultIterator
# whose next() method yields the sequence of hAMRonizedResults.
# The confusing bit is that the parser and iterator classes have been smushed
# into a single abstract base class, forcing our class to be both the creator
# of the iterator and the iterator itself.
#
class ResFinderIterator(hAMRonizedResultIterator):
"""
Updated for ResFinder v4.1 using the `ResFinder_results_tab.txt` output
file
Updated for ResFinder v4.6.0. Processes the `data_resfinder.json` file
which consolidates output from ResFinder, PointFinder, and DisinfFinder.
"""

def __init__(self, source, metadata):
metadata["reference_database_name"] = "resfinder"
metadata["analysis_software_name"] = "resfinder"
# even though resfinderv4 runs pointfinder
# parsing mutational resistance requires parsing a different file
# to get mutations (pointfinder)
metadata["genetic_variation_type"] = GENE_PRESENCE
self.metadata = metadata

self.field_mapping = {
"Resistance gene": "gene_symbol",
"Identity": "sequence_identity",
"Alignment Length/Gene Length": None,
"Position in reference": None,
"Contig": "input_sequence_id",
"Position in contig": None,
"Accession no.": "reference_accession",
"Phenotype": "drug_class",
"Coverage": "coverage_percentage",
# decomposed from Position in contig e.g "10432..11277"
"_start": "input_gene_start",
"_stop": "input_gene_stop",
# infered from Position in contig field
"_strand": "strand_orientation",
# Resistance gene is mapped to both symbol and name
"_gene_name": "gene_name",
# decomposed from Alignment Length/Gene Length e.g 1176/1176
"_reference_gene_length": "reference_gene_length",
}

super().__init__(source, self.field_mapping, self.metadata)

# We don't use metadata or field mapping so can just defer to super,
# which will open source and invoke our parse method with the open stream.
super().__init__(source, dict(), metadata)

def parse(self, handle):
"""
Read each and return it
Implements abstract method parse which gets called from __init__ with
the opened source in handle, and needs to set self up as an iterator
that yields the list of results.
"""
reader = csv.DictReader(handle, delimiter="\t")
for result in reader:
result["_gene_name"] = result["Resistance gene"]
# Removes fields that have NA and NA..NA this is typical in the 4.5.0 resfinder when the --inputfastq is implimented
for field, value in result.items():
if value == "NA":
result[field] = None
if value == "NA..NA":
result[field] = None
# If result["Position in contig"] == None then make _start, _stop and _strand = None
if result["Position in contig"] != None:
_start, _stop = result["Position in contig"].split("..")
if _start > _stop :
_strand = "-"
else:
_strand = "+"

result["_start"] = _start
result["_stop"] = _stop
result["_strand"] = _strand
else:
_strand = None
result["_start"] = None
result["_stop"] = None
result["_strand"] = None

_reference_gene_length = result["Alignment Length/Gene Length"].split("/")[
0
]
result["_reference_gene_length"] = _reference_gene_length
yield self.hAMRonize(result, self.metadata)
result = {}

# Result to be returned, with mandatory initialisations of the 9 "positional" args
res = hAMRonizedResult('', '', '', '', '', '', '', '', '')

# Input data read in ResFinder 4.2+ JSON format. This has three main elements:
# - seq_regions: loci/genes that were found, keying into 0 or more phenotypes
# - seq_variations: mutations that key into a seq_region and 0 or more phenotypes
# - phenotypes: antimicriobals keying back into the above objects
data = json.load(handle)

# Helpers to fetch database names and versions from the JSON data
_dbs = data['databases']
_unk_db = {'database_name': "ref-error", 'database_version': "ref-error"}

def _get_db_name(keys):
return _dbs.get(keys[0], _unk_db).get('database_name', "no-name") if keys else "unspecified"

def _get_db_ver(keys):
return _dbs.get(keys[0], _unk_db).get('database_version', "no-version") if keys else "unspecified"

# Setter for the hAMRonizedResult fields related to genes
def set_shared_fields(r):
"""Sets all fields in res that relate to region r. Applies to both gene and mutation results."""

# mandatory
res.gene_symbol = r.get('name', "unspecified")
res.gene_name = r.get('name', "unspecified")
res.reference_accession = r.get('ref_acc', r.get('ref_id', r.get('key', "unknown")))

# optional
res.coverage_percentage = _safe_round(r.get('coverage'), 1)
res.coverage_depth = None # we may have this for mutations detected from reads
res.coverage_ratio = r.get('coverage')/100.0
res.input_sequence_id = r.get('query_id')
res.input_gene_length = _get_length(r.get('query_start_pos'), r.get('query_end_pos'))
res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos'))
res.input_gene_stop = _get_end_pos(r.get('query_start_pos'), r.get('query_end_pos'))
res.strand_orientation = _get_strand(r.get('query_start_pos'), r.get('query_end_pos'))
res.predicted_phenotype = _empty_to_none(", ".join(r.get('phenotypes', [])))
res.predicted_phenotype_confidence_level = _condense_notes(r.get('notes'), r.get('pmids'))
res.reference_gene_length = r.get('ref_seq_length')
res.reference_gene_start = r.get('ref_start_pos')
res.reference_gene_stop = r.get('ref_end_pos')
res.resistance_mechanism = None # This is available in phenotypes.txt but not in the JSON
res.sequence_identity = _safe_round(r.get('identity'), 2)

# Zap these by default, will only be set in seq_variations below
res.amino_acid_mutation = None
res.amino_acid_mutation_interpretation = None
res.nucleotide_mutation = None
res.nucleotide_mutation_interpretation = None

# Setter for the hAMRonizedResult fields related to mutations
def set_variation_fields(r, vs):
"""Sets the mutation-specific fields in res, aggregating from all variations in vs on region r."""

# Bags to collect variations, phenotypes and notes across the variations
_aa_vars = list()
_nt_vars = list()
_codon = list()
_phenos = set()
_notes = set()
_pmids = set()

# Iterate v over the variations in vs that lie on region r in order of their position
# (variation->regions strangely is a list, so we need to check if r.key is in it)
for v in sorted(filter(lambda v: r['key'] in v.get('seq_regions', []), vs),
key=lambda v: v.get('ref_start_pos', 0)):

# May need refinement to properly accommodate inserts and deletes,
# though it seems recent Res/PointFinder output uses HGVS coordinates.
_seq_var = v.get('seq_var', '')
if _seq_var.startswith('p'):
res.genetic_variation_type = AMINO_ACID_VARIANT # override default set above
_aa_vars.append(_seq_var)
elif _seq_var:
_nt_vars.append(_seq_var)

_cod_chg = v.get('codon_change')
if _cod_chg:
_codon.append(v.get('codon_change'))

# Add the content of the list fields to the bags above
fold(lambda s, e: s.add(e), _phenos, v.get('phenotypes', []))
fold(lambda s, e: s.add(e), _notes, v.get('notes', []))
fold(lambda s, e: s.add(e), _pmids, v.get('pmids', []))

# We have collected all variations on region r, now collapse into fields on res
res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos)))
res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids)
res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars)))
res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars)))
res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codon)) if _codon else None

# --- Do the actual work --- #

# Set the fields that are independent of gene, mutation, phenotype
res.input_file_name = data['software_executions'].copy().popitem()[1]['parameters']['sample_name']
res.analysis_software_name = data['software_name']
res.analysis_software_version = data['software_version']

# We flatten the ResFinder data graph as follows
# - iterate over all phenotypes p (generally: antimicrobials) that have amr_resistant=true
# - iterate over the seq_regions r referenced by p (generally: resistance genes)
# - for each r report a GENE_PRESENCE
# - group the seq_variations referenced by p by the seq_region r they lie on
# - iterate over the regions r
# - for each r report one AMINO_ACID_VARIANT record, collapsing the seq_variations
for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()):

# Set the fields available on phenotype object
res.drug_class = ", ".join(p.get('amr_classes', []))
res.antimicrobial_agent = p.get('amr_resistance', "unspecified")
res.reference_database_name = _get_db_name(p.get('ref_database'))
res.reference_database_version = _get_db_ver(p.get('ref_database'))

# Iterate r over the regions (AMR genes) referenced by p, and yield each in turn
for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])):

res.genetic_variation_type = GENE_PRESENCE
set_shared_fields(r)

# Yield a new hAMRonizedResult ours using super's method as that may do the needful
yield self.hAMRonize(None, res.__dict__)

# Collect the list of seq_variations (if any) referenced from phenotype p,
# and the set of regions that these mutations lie on, so that we iterate
# these regions and "collapse" all mutations for that region onto one record
vs = list(map(lambda k: data['seq_variations'][k], p.get('seq_variations', [])))
rs = set(fold(lambda a, v: a + v.get('seq_regions', []), [], vs))

# Iterate r over each region referenced by some set of variations, and yield each
for r in map(lambda k: data['seq_regions'][k], rs):

res.genetic_variation_type = NUCLEOTIDE_VARIANT # default may be overridden
set_shared_fields(r)
set_variation_fields(r, vs)

# Yield a new hAMRonizedResult ours using super's method as that may do the needful
yield self.hAMRonize(None, res.__dict__)


# Miscellaneous little helper functions to keep the above uncluttered

def _get_start_pos(p0, p1):
return None if not (type(p0) is int and type(p1) is int) else p0 if p0 <= p1 else p1


def _get_end_pos(p0, p1):
return None if not (type(p0) is int and type(p1) is int) else p1 if p1 >= p1 else p0


def _get_strand(p0, p1):
return None if not (type(p0) is int and type(p1) is int) else '+' if p0 <= p1 else '-'


def _get_length(p0, p1):
return None if not (type(p0) is int and type(p1) is int) else abs(p1 - p0) + 1


def _safe_round(n, d):
return None if n is None else round(n, d)


def _empty_to_none(s):
return s if len(s) else None


def _condense_notes(notes, pmids):
"""Helper to condense notes and PMID references into a single line"""
lines = []
lines += filter(None, notes)
pmids = list(filter(None, pmids))
if pmids:
lines.append("PMIDs: " + ", ".join(set(pmids)))
return ". ".join(lines) if lines else None


def fold(fun, acc, lst): # Python's missing function (why have map but not fold?)
"""Left fold iterable `lst` onto `acc` by iterating `acc = fun(acc, x)`"""
for x in lst:
acc = fun(acc, x)
return acc
4 changes: 2 additions & 2 deletions hAMRonization/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

__version__ = "1.1.7"
__version__ = "1.1.8"

from hAMRonization import AbricateIO
from hAMRonization import AmrFinderPlusIO
Expand Down Expand Up @@ -47,7 +47,7 @@
"amrfinderplus": "OUTPUT.tsv",
"ariba": "OUTDIR/OUTPUT.tsv",
"rgi": "OUTPUT.txt or OUTPUT_bwtoutput.gene_mapping_data.txt",
"resfinder": "ResFinder_results_tab.txt",
"resfinder": "data_resfinder.json",
"srax": "sraX_detected_ARGs.tsv",
"deeparg": "OUTDIR/OUTPUT.mapping.ARG",
"kmerresistance": "OUTPUT.res",
Expand Down
5 changes: 5 additions & 0 deletions test/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
/hamronized_*.json
/hamronized_*.tsv
/summary.html
/summary.json
/summary.tsv
2 changes: 0 additions & 2 deletions test/data/dummy/resfinder/ResFinder_results_tab.txt

This file was deleted.

1 change: 1 addition & 0 deletions test/data/dummy/resfinder/data_resfinder.json

Large diffs are not rendered by default.

2 changes: 0 additions & 2 deletions test/data/dummy/resfinder/resfinder_inputfastq_inputfasta.txt

This file was deleted.

1 change: 1 addition & 0 deletions test/data/raw_outputs/resfinder/data_resfinder.json

Large diffs are not rendered by default.

15 changes: 0 additions & 15 deletions test/data/raw_outputs/resfinder/resfinder_inputfasta_tab.txt

This file was deleted.

This file was deleted.

14 changes: 0 additions & 14 deletions test/data/raw_outputs/resfinder/resfinder_inputfastq_tab.txt

This file was deleted.

Loading

0 comments on commit 1ee9d42

Please sign in to comment.