forked from yanailab/CEL-Seq-pipeline
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbowtie_wrapper.py
76 lines (69 loc) · 2.94 KB
/
bowtie_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/python2
""" run bowtie with specified parameter file
"""
import subprocess
from logging import getLogger
import os
import shlex
import argparse
import csv
from glob import glob
from multiprocessing.pool import ThreadPool
logger = getLogger('pijp.bowtie_wrapper')
def build_bowtie_command(fastq_file, index_file, number_of_threads, output_dir, extra_params):
base_fastq = os.path.splitext(os.path.basename(fastq_file))[0]
samfile = os.path.join(output_dir, base_fastq + ".sam")
## no-hd means no header lines.
## -p is for the number of rows.
bowtie_cmd = "bowtie2 -p {0} {1} -x {2} -U {3} -S {4} ".format(number_of_threads, extra_params, index_file, fastq_file, samfile)
return bowtie_cmd
def run_cmd((cmd,fastq_file)):
""" Run the command, and return the stderr.
"""
logger.info("ran : " + cmd)
pro = subprocess.Popen(cmd, shell=True, stderr = subprocess.PIPE)
(x, stderr) = pro.communicate()
assert (pro.returncode == 0 ), "bowtie error %d : %s" % (pro.returncode, stderr)
new_row = ( [fastq_file] + get_stats(stderr.splitlines()) )
logger.info("finished : " + cmd)
return new_row
def main(input_files, index_file, number_of_threads, output_dir, bowtie_report_name,extra_params,procs=10):
report = []
ht_col2 = []
base_names = []
cmds = []
for fastq_file in input_files:
base_names += [os.path.splitext(os.path.basename(fastq_file))[0]]
# we need base_names for heading the matrix file.
bt2_cmd = build_bowtie_command(fastq_file, index_file, number_of_threads, output_dir, extra_params)
cmds.append((bt2_cmd,fastq_file))
ht_col1 = "\n".join(base_names)
pool = ThreadPool(int(procs))
results = pool.map(run_cmd, cmds)
for res in results:
htout = "\t".join(res)
ht_col2.append(htout)
matrix_header = "\t".join(['#sample','total', 'not_aligned', 'aligned_once', 'multi_aligned', '% mapped']) + "\n"
matrix = "\n".join(ht_col2)
filename = os.path.join(output_dir, bowtie_report_name)
with open(filename, 'wb') as f:
f.write(matrix_header)
f.write(matrix)
def get_stats(bt_stderr):
## Write all warnings to log, and skip them.
# For millions of warnings this takes too long..
#while bt_stderr[0].startswith("Warning"):
# logger.info("BOWTIE : " + bt_stderr[0])
# bt_stderr.pop(0)
if len(bt_stderr) > 5:
logger.info("Bowtie probably had {0} warnings [e.g. read too small]".format(len(bt_stderr)-4))
bt_stderr = bt_stderr[-5:]
if int(bt_stderr[0].split()[0]) != 0 :
total = str(int(bt_stderr[0].split()[0]))
not_aligned = str(int(bt_stderr[1].split()[0]))
aligned = str(int(bt_stderr[2].split()[0]))
multi_aligned = str(int(bt_stderr[3].split()[0]))
mapped_percent = bt_stderr[4].split()[0]
return [total, not_aligned, aligned, multi_aligned, mapped_percent]
else:
return ['0','0','0','0','0']