-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcheck_megaplasmids.py
107 lines (84 loc) · 4.39 KB
/
check_megaplasmids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
"""
This script checks the plasmid references for megaplasmids, i.e. plasmids that have more than 450kbp. Both,
non-megaplasmids and megaplasmids, are plotted with regard to their length. Finally, we extract all megaplasmids that
have a certain similarity to our chromosome contigs and check them manually afterwards. Actually, we only found two such
megaplasmids.
Note: Due to our manual checks, the megaplasmid with the ID "LR214949.1" and a length of 843,827 was removed from the
positive reference data as it is a falsely classified. It is part of the anomalous and contaminated assembly
"GCF_900660475.1" (see https://www.ncbi.nlm.nih.gov/assembly/GCF_900660475.1) but nevertheless included in the PLSDB.
"""
import click
import glob
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sourmash
from Bio import SeqIO
def get_seq_len(dir):
lengths = dict()
for file in glob.glob(f'{dir}/*.fasta'):
with open(file, 'r') as f:
for record in SeqIO.parse(f, 'fasta'):
if file not in lengths:
lengths[file] = len(record.seq)
else:
lengths[file] += len(record.seq)
return lengths
def plot_distribution(plotdata, title, figure_dir, bins):
_, ax = plt.subplots(figsize=(16, 8))
sns.histplot(plotdata, x='Length', ax=ax, bins=bins)
plt.title(f'Distribution of {title} Lengths', fontsize=22)
plt.ticklabel_format(style='plain', axis='x')
plt.xlim(0, plotdata['Length'].max())
plt.xlabel('Length', fontsize=18)
plt.ylabel('Count', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()
plt.savefig(f'{figure_dir}/{title.lower()}_lengths.png', dpi=300, facecolor='white', edgecolor='none')
def calculate_signatures(input_dir, megaplasmids=None):
signatures = list()
if megaplasmids:
files = [f'{input_dir}/{mp}' for mp in megaplasmids]
else:
files = glob.glob(f'{input_dir}/*.fasta')
for fasta_file in files:
with open(fasta_file, 'r') as f:
for record in SeqIO.parse(f, 'fasta'):
mh = sourmash.MinHash(n=0, ksize=31, scaled=100)
mh.add_sequence(str(record.seq), force=True)
signatures.append((fasta_file, mh, record.id, len(record.seq)))
return signatures
@click.command()
@click.option('--ref_neg_dir', '-n', type=click.Path(exists=True), required=True)
@click.option('--ref_pos_dir', '-p', type=click.Path(exists=True), required=True)
@click.option('--figure_dir', '-f', type=click.Path(exists=True), required=True)
@click.option('--sim_threshold', '-t', default=0.9)
def main(ref_neg_dir, ref_pos_dir, sim_threshold, figure_dir):
species_lengths = get_seq_len(ref_neg_dir)
plasmid_lengths = pd.DataFrame.from_dict(get_seq_len(ref_pos_dir))
plasmid_lengths.to_csv(f'{ref_pos_dir}/plasmid_lengths.csv', index=False)
megaplasmids = plasmid_lengths[plasmid_lengths['Length'] >= 450000]
other_plasmids = plasmid_lengths[plasmid_lengths['Length'] < 450000]
megaplasmids.to_csv(f'{ref_pos_dir}/megaplasmid_lengths.csv', index=False)
plot_distribution(megaplasmids, 'Megaplasmid', figure_dir, 50)
plot_distribution(other_plasmids, 'Non-Megaplasmid', figure_dir, 'auto')
ref_neg_sigs = calculate_signatures(ref_neg_dir)
megaplasmid_sigs = calculate_signatures(ref_pos_dir, megaplasmids)
df = pd.DataFrame(columns=['Plasmid', 'Plasmid Length', 'Similarity', 'Species', 'Species Length', 'Contig',
'Contig Length'])
for p_path, p_hash, _, p_len in megaplasmid_sigs:
for n_path, n_hash, n_id, n_len in ref_neg_sigs:
sim = n_hash.similarity(p_hash, downsample=True)
if sim >= sim_threshold:
df = pd.concat([df, pd.DataFrame([{'Plasmid': os.path.basename(p_path),
'Plasmid Length': p_len,
'Similarity': sim,
'Species': os.path.basename(n_path),
'Species Length': species_lengths[n_path],
'Contig': n_id,
'Contig Length': n_len}])])
df.to_csv(f'{ref_pos_dir}/similar_megaplasmids.csv', index=False)
if __name__ == '__main__':
main()