Merge pull request #94 from zwets/upgrade-resfinder
Upgrade ResFinder parser to 4.6.0
Showing 13 changed files with 324 additions and 151 deletions.
@@ -1,87 +1,220 @@
#!/usr/bin/env python

import csv
import json
from .Interfaces import hAMRonizedResultIterator
from hAMRonization.constants import GENE_PRESENCE
from .hAMRonizedResult import hAMRonizedResult
from hAMRonization.constants import (GENE_PRESENCE, NUCLEOTIDE_VARIANT, AMINO_ACID_VARIANT)

# No required metadata, everything is in the JSON
required_metadata = [
    "analysis_software_version",
    "reference_database_version",
    "input_file_name",
]

# This class is the parser instantiated by the hAMRonize framework to parse
# ResFinder output. Its parse() method must return a hAMRonizedResultIterator
# whose next() method yields the sequence of hAMRonizedResults.
# The confusing bit is that the parser and iterator classes have been smushed
# into a single abstract base class, forcing our class to be both the creator
# of the iterator and the iterator itself.
#
class ResFinderIterator(hAMRonizedResultIterator):
    """
    Updated for ResFinder v4.1 using the `ResFinder_results_tab.txt` output
    file
    Updated for ResFinder v4.6.0. Processes the `data_resfinder.json` file
    which consolidates output from ResFinder, PointFinder, and DisinfFinder.
    """

    def __init__(self, source, metadata):
        metadata["reference_database_name"] = "resfinder"
        metadata["analysis_software_name"] = "resfinder"
        # even though ResFinder v4 runs PointFinder, parsing mutational resistance
        # requires parsing a different file to get the mutations (PointFinder)
        metadata["genetic_variation_type"] = GENE_PRESENCE
        self.metadata = metadata

        self.field_mapping = {
            "Resistance gene": "gene_symbol",
            "Identity": "sequence_identity",
            "Alignment Length/Gene Length": None,
            "Position in reference": None,
            "Contig": "input_sequence_id",
            "Position in contig": None,
            "Accession no.": "reference_accession",
            "Phenotype": "drug_class",
            "Coverage": "coverage_percentage",
            # decomposed from Position in contig, e.g. "10432..11277"
            "_start": "input_gene_start",
            "_stop": "input_gene_stop",
            # inferred from the Position in contig field
            "_strand": "strand_orientation",
            # Resistance gene is mapped to both symbol and name
            "_gene_name": "gene_name",
            # decomposed from Alignment Length/Gene Length, e.g. 1176/1176
            "_reference_gene_length": "reference_gene_length",
        }

        super().__init__(source, self.field_mapping, self.metadata)

        # We don't use metadata or field mapping so can just defer to super,
        # which will open source and invoke our parse method with the open stream.
        super().__init__(source, dict(), metadata)

    def parse(self, handle):
        """
        Read each result and return it
        Implements abstract method parse which gets called from __init__ with
        the opened source in handle, and needs to set self up as an iterator
        that yields the list of results.
        """
        reader = csv.DictReader(handle, delimiter="\t")
        for result in reader:
            result["_gene_name"] = result["Resistance gene"]
            # Remove fields that have "NA" or "NA..NA"; these are typical of
            # ResFinder 4.5.0 output when --inputfastq is used
            for field, value in result.items():
                if value == "NA":
                    result[field] = None
                if value == "NA..NA":
                    result[field] = None
            # If result["Position in contig"] is None then set _start, _stop and _strand to None
            if result["Position in contig"] is not None:
                _start, _stop = result["Position in contig"].split("..")
                if _start > _stop:
                    _strand = "-"
                else:
                    _strand = "+"

                result["_start"] = _start
                result["_stop"] = _stop
                result["_strand"] = _strand
            else:
                _strand = None
                result["_start"] = None
                result["_stop"] = None
                result["_strand"] = None

            _reference_gene_length = result["Alignment Length/Gene Length"].split("/")[0]
            result["_reference_gene_length"] = _reference_gene_length
            yield self.hAMRonize(result, self.metadata)
            result = {}

        # Result to be returned, with mandatory initialisations of the 9 "positional" args
        res = hAMRonizedResult('', '', '', '', '', '', '', '', '')

        # Input data read in ResFinder 4.2+ JSON format. This has three main elements:
        # - seq_regions: loci/genes that were found, keying into 0 or more phenotypes
        # - seq_variations: mutations that key into a seq_region and 0 or more phenotypes
        # - phenotypes: antimicrobials keying back into the above objects
        data = json.load(handle)

        # Helpers to fetch database names and versions from the JSON data
        _dbs = data['databases']
        _unk_db = {'database_name': "ref-error", 'database_version': "ref-error"}

        def _get_db_name(keys):
            return _dbs.get(keys[0], _unk_db).get('database_name', "no-name") if keys else "unspecified"

        def _get_db_ver(keys):
            return _dbs.get(keys[0], _unk_db).get('database_version', "no-version") if keys else "unspecified"

        # Setter for the hAMRonizedResult fields related to genes
        def set_shared_fields(r):
            """Sets all fields in res that relate to region r. Applies to both gene and mutation results."""

            # mandatory
            res.gene_symbol = r.get('name', "unspecified")
            res.gene_name = r.get('name', "unspecified")
            res.reference_accession = r.get('ref_acc', r.get('ref_id', r.get('key', "unknown")))

            # optional
            res.coverage_percentage = _safe_round(r.get('coverage'), 1)
            res.coverage_depth = None  # we may have this for mutations detected from reads
            res.coverage_ratio = r.get('coverage') / 100.0 if r.get('coverage') is not None else None
            res.input_sequence_id = r.get('query_id')
            res.input_gene_length = _get_length(r.get('query_start_pos'), r.get('query_end_pos'))
            res.input_gene_start = _get_start_pos(r.get('query_start_pos'), r.get('query_end_pos'))
            res.input_gene_stop = _get_end_pos(r.get('query_start_pos'), r.get('query_end_pos'))
            res.strand_orientation = _get_strand(r.get('query_start_pos'), r.get('query_end_pos'))
            res.predicted_phenotype = _empty_to_none(", ".join(r.get('phenotypes', [])))
            res.predicted_phenotype_confidence_level = _condense_notes(r.get('notes', []), r.get('pmids', []))
            res.reference_gene_length = r.get('ref_seq_length')
            res.reference_gene_start = r.get('ref_start_pos')
            res.reference_gene_stop = r.get('ref_end_pos')
            res.resistance_mechanism = None  # This is available in phenotypes.txt but not in the JSON
            res.sequence_identity = _safe_round(r.get('identity'), 2)

            # Zap these by default, will only be set in seq_variations below
            res.amino_acid_mutation = None
            res.amino_acid_mutation_interpretation = None
            res.nucleotide_mutation = None
            res.nucleotide_mutation_interpretation = None

        # Setter for the hAMRonizedResult fields related to mutations
        def set_variation_fields(r, vs):
            """Sets the mutation-specific fields in res, aggregating from all variations in vs on region r."""

            # Bags to collect variations, phenotypes and notes across the variations
            _aa_vars = list()
            _nt_vars = list()
            _codon = list()
            _phenos = set()
            _notes = set()
            _pmids = set()

            # Iterate v over the variations in vs that lie on region r in order of their position
            # (variation->regions strangely is a list, so we need to check if r.key is in it)
            for v in sorted(filter(lambda v: r['key'] in v.get('seq_regions', []), vs),
                            key=lambda v: v.get('ref_start_pos', 0)):

                # May need refinement to properly accommodate inserts and deletes,
                # though it seems recent Res/PointFinder output uses HGVS coordinates.
                _seq_var = v.get('seq_var', '')
                if _seq_var.startswith('p'):
                    res.genetic_variation_type = AMINO_ACID_VARIANT  # override default set above
                    _aa_vars.append(_seq_var)
                elif _seq_var:
                    _nt_vars.append(_seq_var)

                _cod_chg = v.get('codon_change')
                if _cod_chg:
                    _codon.append(v.get('codon_change'))

                # Add the content of the list fields to the bags above
                # (the lambda must return the accumulator set, not the None from set.add)
                fold(lambda s, e: (s.add(e), s)[1], _phenos, v.get('phenotypes', []))
                fold(lambda s, e: (s.add(e), s)[1], _notes, v.get('notes', []))
                fold(lambda s, e: (s.add(e), s)[1], _pmids, v.get('pmids', []))

            # We have collected all variations on region r, now collapse into fields on res
            res.predicted_phenotype = _empty_to_none(", ".join(filter(None, _phenos)))
            res.predicted_phenotype_confidence_level = _condense_notes(_notes, _pmids)
            res.amino_acid_mutation = _empty_to_none(", ".join(filter(None, _aa_vars)))
            res.nucleotide_mutation = _empty_to_none(", ".join(filter(None, _nt_vars)))
            res.nucleotide_mutation_interpretation = ("Codon changes: " + " ".join(_codon)) if _codon else None

        # --- Do the actual work --- #

        # Set the fields that are independent of gene, mutation, phenotype
        res.input_file_name = data['software_executions'].copy().popitem()[1]['parameters']['sample_name']
        res.analysis_software_name = data['software_name']
        res.analysis_software_version = data['software_version']

        # We flatten the ResFinder data graph as follows:
        # - iterate over all phenotypes p (generally: antimicrobials) that have amr_resistant=true
        #   - iterate over the seq_regions r referenced by p (generally: resistance genes)
        #     - for each r report a GENE_PRESENCE record
        #   - group the seq_variations referenced by p by the seq_region r they lie on
        #   - iterate over those regions r
        #     - for each r report one variant record (NUCLEOTIDE_VARIANT or AMINO_ACID_VARIANT), collapsing the seq_variations
        for p in filter(lambda d: d.get('amr_resistant', False), data['phenotypes'].values()):

            # Set the fields available on the phenotype object
            res.drug_class = ", ".join(p.get('amr_classes', []))
            res.antimicrobial_agent = p.get('amr_resistance', "unspecified")
            res.reference_database_name = _get_db_name(p.get('ref_database'))
            res.reference_database_version = _get_db_ver(p.get('ref_database'))

            # Iterate r over the regions (AMR genes) referenced by p, and yield each in turn
            for r in map(lambda k: data['seq_regions'][k], p.get('seq_regions', [])):

                res.genetic_variation_type = GENE_PRESENCE
                set_shared_fields(r)

                # Yield a new hAMRonizedResult from ours via super's hAMRonize method, as that may do additional processing
                yield self.hAMRonize(None, res.__dict__)

            # Collect the list of seq_variations (if any) referenced from phenotype p,
            # and the set of regions that these mutations lie on, so that we iterate
            # these regions and "collapse" all mutations for that region onto one record
            vs = list(map(lambda k: data['seq_variations'][k], p.get('seq_variations', [])))
            rs = set(fold(lambda a, v: a + v.get('seq_regions', []), [], vs))

            # Iterate r over each region referenced by the collected variations, and yield each
            for r in map(lambda k: data['seq_regions'][k], rs):

                res.genetic_variation_type = NUCLEOTIDE_VARIANT  # default may be overridden
                set_shared_fields(r)
                set_variation_fields(r, vs)

                # Yield a new hAMRonizedResult from ours via super's hAMRonize method, as that may do additional processing
                yield self.hAMRonize(None, res.__dict__)


# Miscellaneous little helper functions to keep the above uncluttered

def _get_start_pos(p0, p1):
    return None if not (type(p0) is int and type(p1) is int) else p0 if p0 <= p1 else p1


def _get_end_pos(p0, p1):
    return None if not (type(p0) is int and type(p1) is int) else p1 if p1 >= p0 else p0


def _get_strand(p0, p1):
    return None if not (type(p0) is int and type(p1) is int) else '+' if p0 <= p1 else '-'


def _get_length(p0, p1):
    return None if not (type(p0) is int and type(p1) is int) else abs(p1 - p0) + 1


def _safe_round(n, d):
    return None if n is None else round(n, d)


def _empty_to_none(s):
    return s if len(s) else None


def _condense_notes(notes, pmids):
    """Helper to condense notes and PMID references into a single line"""
    lines = []
    lines += filter(None, notes)
    pmids = list(filter(None, pmids))
    if pmids:
        lines.append("PMIDs: " + ", ".join(set(pmids)))
    return ". ".join(lines) if lines else None


def fold(fun, acc, lst):  # Python's missing function (why have map but not fold?)
    """Left fold iterable `lst` onto `acc` by iterating `acc = fun(acc, x)`"""
    for x in lst:
        acc = fun(acc, x)
    return acc
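
For orientation, here is a minimal usage sketch (not part of this commit) of how the upgraded parser might be driven through the package-level hAMRonization.parse() helper described in the project README. The JSON path and metadata values are placeholders, and the three metadata keys mirror the required_metadata list declared at the top of the parser.

```python
# Minimal usage sketch, not part of this commit. The path and metadata values
# below are placeholders for a real ResFinder 4.6.0 run.
import hAMRonization

metadata = {
    "analysis_software_version": "4.6.0",        # placeholder: ResFinder version used
    "reference_database_version": "2024-08-01",  # placeholder: database version used
    "input_file_name": "sample1",                # placeholder: sample identifier
}

# parse() instantiates ResFinderIterator; iterating it yields one hAMRonizedResult
# per gene-presence or variant record, as emitted by the parse() method above.
parsed_report = hAMRonization.parse("data_resfinder.json", metadata, "resfinder")

for result in parsed_report:
    print(result.gene_symbol, result.genetic_variation_type, result.drug_class)
```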
@@ -0,0 +1,5 @@
/hamronized_*.json
/hamronized_*.tsv
/summary.html
/summary.json
/summary.tsv
test/data/dummy/resfinder/resfinder_inputfastq_inputfasta.txt (2 changes: 0 additions & 2 deletions)
This file was deleted.
test/data/raw_outputs/resfinder/resfinder_inputfasta_tab.txt (15 changes: 0 additions & 15 deletions)
This file was deleted.
test/data/raw_outputs/resfinder/resfinder_inputfastq_inputfasta.txt (14 changes: 0 additions & 14 deletions)
This file was deleted.
test/data/raw_outputs/resfinder/resfinder_inputfastq_tab.txt (14 changes: 0 additions & 14 deletions)
This file was deleted.