-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcalc.pairwise.similarities.py
executable file
·98 lines (89 loc) · 3.69 KB
/
calc.pairwise.similarities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
from argparse import ArgumentParser
from itertools import combinations
from glob import glob
from re import search, DOTALL
from shutil import rmtree
from tempfile import mkdtemp
import os
def parseArgs():
parser = ArgumentParser(description='Computes pairwise percent '
'nucleotide similarities by performing local or global alignments. '
'Requires the blastn binary from the BLAST+ suite, the '
'needle binary from The European Molecular Biology '
'Open Software Suite (EMBOSS) or the ggsearch36 binary from '
'the FASTA package suite.', add_help=False)
req = parser.add_argument_group('Required')
req.add_argument('-i', '--inpath', required=True, metavar='PATH',
help='path containing input files')
opt = parser.add_argument_group('Optional')
opt.add_argument('-a', '--aligner', choices=['blastn', 'ggsearch36',
'needle'], default='blastn', help='binary name to use for local or '
'global alignment [blastn]')
opt.add_argument('-c', '--cpus', type=int, metavar='INT',
default=1, help='number of CPUs to use [1]')
opt.add_argument('-e', '--ext', type=str, metavar='STR', default='fasta',
help='file extension to find all nucleotide FastA files within the '
'specified inpath [fasta]')
opt.add_argument('-h', '--help', action='help',
help='show this help message and exit')
opt.add_argument('-o', '--outfile', metavar='FILE',
default='Pairwise.Similarities.tab',
help='output file [./Pairwise.Similarities.tab]')
opt.add_argument('-p', '--pref', type=str, metavar='STR', default='',
help='file prefix to find all nucleotide FastA files within the '
'specified inpath [*]')
return parser.parse_args()
def main():
opts = parseArgs()
aligner = opts.aligner
inpath = os.path.abspath(opts.inpath)
ext = opts.ext
pref = opts.pref
tmp = mkdtemp()
outfile = os.path.abspath(opts.outfile)
# Get sorted unique list of tuples
files = glob(os.path.join(inpath, '{}*.{}'.format(pref, ext)))
uniq = set()
for i in combinations(files, 2):
if i[0] != i[1]:
uniq.add(i)
sort_uniq_filepairs = sorted(uniq, key=lambda tupl:tupl[0])
with open(outfile, 'w') as ofh:
for i, j in sort_uniq_filepairs:
b1 = os.path.basename(i).rstrip('.' + ext)
b2 = os.path.basename(j).rstrip('.' + ext)
tmpfile = os.path.join(tmp, b1 + ',' + b2 + '.similarity')
# Run global alignments
if aligner == 'needle':
os.system('needle -asequence {} -snucleotide1 -bsequence '
'{} -snucleotide2 -gapopen 10.0 -gapextend 0.5 -outfile '
'{} -aformat3 markx2 -awidth3 60 -auto -error'.format(
i, j, tmpfile))
elif aligner == 'ggsearch36':
os.system('ggsearch36 -T {} -b 1 -f 10 -g 0.5 -n -m 0 -O {} -w 80 '
'{} {} > {}'.format(opts.cpus, tmpfile, i, j, os.devnull))
elif aligner == 'blastn':
tmp_b1 = os.path.join(tmp, b1)
os.system('makeblastdb -dbtype nucl -in {} -out {} '
'> {}'.format(i, tmp_b1, os.devnull))
os.system('blastn -num_threads {} -task blastn -outfmt \"6 ppos\" '
'-db {} -query {} -out {} > {}'.format(opts.cpus, tmp_b1, j,
tmpfile, os.devnull))
# Parse percent similarity values
dat = open(tmpfile).readlines()
if aligner == 'needle': #(Version: EMBOSS:6.6.0.0)
f = search(r'#\s+Similarity:\s+\d+\/\d+\s+\(\d+.\d+\%\)\n',
''.join(dat))
similarity = f.group(0).split('(')[-1].rstrip('%)\n')
elif aligner == 'ggsearch36': #GGSEARCH 36.3.7 Oct, 2014preload9
f = search(r'global\/global.+similar.+\n',
''.join(dat), flags=DOTALL)
similarity = f.group(0).split('% identity (')[-1].split(
'% similar)')[0]
elif aligner == 'blastn': #v2.2.31+
similarity = dat[0].rstrip('\n')
ofh.write('{}\t{}\t{}\n'.format(b1, b2, similarity))
rmtree(tmp)
if __name__ == '__main__':
main()