Skip to content

Commit

Permalink
add split at rank and bump version
Browse files Browse the repository at this point in the history
  • Loading branch information
maxibor committed Jul 1, 2022
1 parent 2c42f54 commit eebfc83
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 3 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to [sam2lca](https://github.com/maxibor/sam2lca) will be doc
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 1.1.0

- Added the option to split bam ouput file by TAXID at specified rank

## 1.0.0

### Added
Expand Down
2 changes: 1 addition & 1 deletion sam2lca/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "1.0.0"
__version__ = "1.1.0"

from sam2lca.main import sam2lca
import logging
Expand Down
15 changes: 15 additions & 0 deletions sam2lca/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import click
from sam2lca import __version__
from sam2lca.main import sam2lca, update_database, list_available_db
from sam2lca.utils import taxo_ranks
from pathlib import Path


Expand Down Expand Up @@ -131,6 +132,20 @@ def cli(ctx, dbdir):
is_flag=True,
help="Write BAM output file with XT tag for TAXID",
)
@click.option(
"-r",
"--bam_split_rank",
type=click.Choice(taxo_ranks, case_sensitive=False),
help="Write BAM output file split by TAXID at rank -r. To use in combination with -b/--bam_out",
)
@click.option(
"-n",
"--bam_split_read",
type=int,
default=50,
show_default=True,
help="Minimum numbers of reads to write BAM split by TAXID. To use in combination with -b/--bam_out",
)
def analyze(ctx, no_args_is_help=True, **kwargs):
"""\b
Run the sam2lca analysis
Expand Down
36 changes: 35 additions & 1 deletion sam2lca/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
from sam2lca.lca import compute_lca
from sam2lca import utils
from sam2lca.taxonomy import setup_taxopy_db, load_taxonomy_db
from sam2lca.write_alignment import write_bam_tags
from sam2lca.write_alignment import write_bam_tags, write_bam_by_taxid
from pathlib import Path
import logging
from multiprocessing import cpu_count
from tqdm.contrib.concurrent import thread_map
from functools import partial
import sys
import os
from pprint import pprint


def sam2lca(
Expand All @@ -23,6 +26,8 @@ def sam2lca(
length=30,
conserved=False,
bam_out=False,
bam_split_rank=False,
bam_split_read=50,
):
"""Performs LCA on SAM/BAM/CRAM alignment file
Expand All @@ -37,6 +42,8 @@ def sam2lca(
edit_distance(int): Maximum edit distance threshold
length(int): Minimum alignment length
bam_out(bool): Write BAM output file with XT tag for TAXID
bam_split_rank(str): Rank to split BAM output file by TAXID
bam_split_read(int): Minimum number of reads to split BAM output file by TAXID
"""
nb_steps = 8 if conserved else 7
process = cpu_count() if process == 0 else process
Expand Down Expand Up @@ -88,6 +95,33 @@ def sam2lca(
nb_steps=nb_steps,
taxo_db=TAXDB,
)


if bam_out and bam_split_rank:
logging.info(f"* BAM files split by TAXID at the {bam_split_rank} level")
taxids_list = utils.filter_taxid_info_dict(
taxid_info_dict,
rank=bam_split_rank,
minread=bam_split_read
).keys()
write_bam_by_taxid_partial = partial(
write_bam_by_taxid,
infile=sam,
outfile=output["bam"],
total_reads = al.total_reads,
read_taxid_dict=reads_taxid_dict,
acc2tax_dict=al.acc2tax,
taxid_info_dict=taxid_info_dict,
identity=identity,
edit_distance=distance,
minlength=length,
)
thread_map(write_bam_by_taxid_partial,
taxids_list,
chunksize=1,
max_workers=process,
)

if bam_out:
write_bam_tags(
infile=sam,
Expand Down
20 changes: 20 additions & 0 deletions sam2lca/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,16 @@

warnings.simplefilter(action="ignore", category=FutureWarning)

taxo_ranks = [
'strain',
'species',
'genus',
'family',
'order',
'class',
'phylum',
'superkingdom'
]

def count_reads_taxid(read_taxid_dict):
"""Returns number of reads matching TAXID
Expand Down Expand Up @@ -103,17 +113,20 @@ def taxid_to_lineage_single(taxid_items, taxid_info_dict, taxo_db):
sciname = taxon.name
rank = taxon.rank
lineage = taxon.rank_name_dictionary
taxid_lineage = taxon.taxid_lineage
except Exception:
sciname = "unclassified sequences"
rank = "NA"
lineage = "NA"
taxid_lineage = "NA"

taxid_info_dict[taxid] = {
"name": sciname,
"rank": rank,
"count_taxon": read_count_taxon,
"count_descendant": read_count_descendant,
"lineage": lineage,
"lineage_taxid": taxid_lineage,
}


Expand Down Expand Up @@ -165,6 +178,13 @@ def taxid_to_lineage(taxid_count_dict, output, process, nb_steps, taxo_db):
return taxid_info_dict


def filter_taxid_info_dict(taxid_info_dict, rank, minread):
taxid_info_dict_filtered = dict()
for entry in taxid_info_dict:
if taxid_info_dict[entry]['rank'] == rank and taxid_info_dict[entry]['count_descendant'] >= minread:
taxid_info_dict_filtered[entry] = taxid_info_dict[entry]
return taxid_info_dict_filtered

def get_db_connection(db_path):
return rocksdb.DB(db_path, opts=rocksdb.Options(), read_only=True)

Expand Down
90 changes: 90 additions & 0 deletions sam2lca/write_alignment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,96 @@
from pysam import AlignmentFile
from tqdm import tqdm

def write_bam_by_taxid(
target_taxid,
infile,
outfile,
total_reads,
read_taxid_dict,
acc2tax_dict,
taxid_info_dict,
identity,
edit_distance,
minlength
):
"""Write alignment file with taxonomic information tag
Args:
target_taxid (int): Taxid of target taxon
infile (str): Path to alignment file
outfile (str): Path to output file
total_reads(int): Total number of reads in file
read_taxid_dict (dict): Dictionary of read with their corresponding LCA taxid
taxid_info_dict(dict): Dictionary of taxid with their count and metadata
acc2tax_dict(dict): Accession to taxid dictionary
identity (float): Minimum identity for alignment
edit_distance (int): Minimum edit distance for alignment
minlength (int): Minimum length for alignment
"""
mode = {"sam": "r", "bam": "rb", "cram": "rc"}
filetype = infile.split(".")[-1]
outfile = f"{outfile.replace('.sam2lca.bam','')}_taxid_{target_taxid}.sam2lca.bam"

with AlignmentFile(infile, mode[filetype]) as samfile:
reflens = dict()

for ref in samfile.references:
try:
if target_taxid in acc2tax_dict[ref].taxid_lineage:
reflens[ref] = samfile.get_reference_length(ref)
except KeyError:
continue

header = {
"HD": {"VN": "1.0", "SO": "unsorted"},
"SQ": [
{"LN": reflens[refname], "SN": refname} for refname in reflens.keys()
]
}
refid_dict = dict()
for i, ref in enumerate(reflens.keys()):
refid_dict[ref] = i
with AlignmentFile(outfile, "wb", header=header) as outfile:
for read in tqdm(samfile, desc=str(target_taxid), unit="reads", total=total_reads):
if read.has_tag("NM") and not read.is_unmapped:
mismatch = read.get_tag("NM")
alnLen = read.query_alignment_length
readLen = read.query_length
ident = (alnLen - mismatch) / readLen
if edit_distance:
threshold = edit_distance
align_value = mismatch
else:
threshold = 1 - identity
align_value = 1 - ((alnLen - mismatch) / readLen)
if align_value <= threshold and alnLen >= minlength:
try:
taxid = read_taxid_dict[read.query_name]
if target_taxid in taxid_info_dict[taxid]['lineage_taxid']:
read.set_tag("XT", taxid, "i")
read.set_tag(
"XN",
taxid_info_dict[taxid]["name"],
"Z",
)
read.set_tag(
"XR",
taxid_info_dict[taxid]["rank"],
"Z",
)
read.reference_id = refid_dict[read.reference_name]
try:
outfile.write(read)
except ValueError:
logging.error(f"Error writing read {read.query_name}")
continue
except KeyError:
# print(read, read.reference_name, read.reference_id)
continue

outfile.close()
samfile.close()


def write_bam_tags(
infile,
Expand Down
2 changes: 1 addition & 1 deletion tests/data/results/aligned.sorted.sam2lca.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"543": {"name": "Enterobacteriaceae", "rank": "family", "count_taxon": 2152, "count_descendant": 2875, "lineage": {"family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "91347": {"name": "Enterobacterales", "rank": "order", "count_taxon": 0, "count_descendant": 2875, "lineage": {"order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "1236": {"name": "Gammaproteobacteria", "rank": "class", "count_taxon": 0, "count_descendant": 2875, "lineage": {"class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "1224": {"name": "Proteobacteria", "rank": "phylum", "count_taxon": 0, "count_descendant": 2875, "lineage": {"phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "2": {"name": "Bacteria", "rank": "superkingdom", "count_taxon": 0, "count_descendant": 2875, "lineage": {"superkingdom": "Bacteria"}}, "131567": {"name": "cellular organisms", "rank": "no rank", "count_taxon": 0, "count_descendant": 2875, "lineage": {}}, "1": {"name": "root", "rank": "no rank", "count_taxon": 0, "count_descendant": 2875, "lineage": {}}, "511145": {"name": "Escherichia coli str. K-12 substr. MG1655", "rank": "no rank", "count_taxon": 385, "count_descendant": 385, "lineage": {"strain": "Escherichia coli K-12", "species": "Escherichia coli", "genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "83333": {"name": "Escherichia coli K-12", "rank": "strain", "count_taxon": 0, "count_descendant": 385, "lineage": {"strain": "Escherichia coli K-12", "species": "Escherichia coli", "genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "562": {"name": "Escherichia coli", "rank": "species", "count_taxon": 0, "count_descendant": 385, "lineage": {"species": "Escherichia coli", "genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "561": {"name": "Escherichia", "rank": "genus", "count_taxon": 0, "count_descendant": 385, "lineage": {"genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "300267": {"name": "Shigella dysenteriae Sd197", "rank": "strain", "count_taxon": 338, "count_descendant": 338, "lineage": {"strain": "Shigella dysenteriae Sd197", "species": "Shigella dysenteriae", "genus": "Shigella", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "622": {"name": "Shigella dysenteriae", "rank": "species", "count_taxon": 0, "count_descendant": 338, "lineage": {"species": "Shigella dysenteriae", "genus": "Shigella", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}, "620": {"name": "Shigella", "rank": "genus", "count_taxon": 0, "count_descendant": 338, "lineage": {"genus": "Shigella", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}}}
{"543": {"name": "Enterobacteriaceae", "rank": "family", "count_taxon": 2152, "count_descendant": 2875, "lineage": {"family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [543, 91347, 1236, 1224, 2, 131567, 1]}, "91347": {"name": "Enterobacterales", "rank": "order", "count_taxon": 0, "count_descendant": 2875, "lineage": {"order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [91347, 1236, 1224, 2, 131567, 1]}, "1236": {"name": "Gammaproteobacteria", "rank": "class", "count_taxon": 0, "count_descendant": 2875, "lineage": {"class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [1236, 1224, 2, 131567, 1]}, "1224": {"name": "Proteobacteria", "rank": "phylum", "count_taxon": 0, "count_descendant": 2875, "lineage": {"phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [1224, 2, 131567, 1]}, "2": {"name": "Bacteria", "rank": "superkingdom", "count_taxon": 0, "count_descendant": 2875, "lineage": {"superkingdom": "Bacteria"}, "lineage_taxid": [2, 131567, 1]}, "131567": {"name": "cellular organisms", "rank": "no rank", "count_taxon": 0, "count_descendant": 2875, "lineage": {}, "lineage_taxid": [131567, 1]}, "1": {"name": "root", "rank": "no rank", "count_taxon": 0, "count_descendant": 2875, "lineage": {}, "lineage_taxid": [1]}, "511145": {"name": "Escherichia coli str. K-12 substr. MG1655", "rank": "no rank", "count_taxon": 385, "count_descendant": 385, "lineage": {"strain": "Escherichia coli K-12", "species": "Escherichia coli", "genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [511145, 83333, 562, 561, 543, 91347, 1236, 1224, 2, 131567, 1]}, "83333": {"name": "Escherichia coli K-12", "rank": "strain", "count_taxon": 0, "count_descendant": 385, "lineage": {"strain": "Escherichia coli K-12", "species": "Escherichia coli", "genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [83333, 562, 561, 543, 91347, 1236, 1224, 2, 131567, 1]}, "562": {"name": "Escherichia coli", "rank": "species", "count_taxon": 0, "count_descendant": 385, "lineage": {"species": "Escherichia coli", "genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [562, 561, 543, 91347, 1236, 1224, 2, 131567, 1]}, "561": {"name": "Escherichia", "rank": "genus", "count_taxon": 0, "count_descendant": 385, "lineage": {"genus": "Escherichia", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [561, 543, 91347, 1236, 1224, 2, 131567, 1]}, "300267": {"name": "Shigella dysenteriae Sd197", "rank": "strain", "count_taxon": 338, "count_descendant": 338, "lineage": {"strain": "Shigella dysenteriae Sd197", "species": "Shigella dysenteriae", "genus": "Shigella", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [300267, 622, 620, 543, 91347, 1236, 1224, 2, 131567, 1]}, "622": {"name": "Shigella dysenteriae", "rank": "species", "count_taxon": 0, "count_descendant": 338, "lineage": {"species": "Shigella dysenteriae", "genus": "Shigella", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [622, 620, 543, 91347, 1236, 1224, 2, 131567, 1]}, "620": {"name": "Shigella", "rank": "genus", "count_taxon": 0, "count_descendant": 338, "lineage": {"genus": "Shigella", "family": "Enterobacteriaceae", "order": "Enterobacterales", "class": "Gammaproteobacteria", "phylum": "Proteobacteria", "superkingdom": "Bacteria"}, "lineage_taxid": [620, 543, 91347, 1236, 1224, 2, 131567, 1]}}

0 comments on commit eebfc83

Please sign in to comment.