-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathgapped_contig_joiner.py
executable file
·286 lines (232 loc) · 9.21 KB
/
gapped_contig_joiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#!/usr/bin/env python
"""Stitch contigs together by means of Mummer's nucmer using given
reference. Gaps can optionally be filled with reference instead of Ns
"""
#--- standard library imports
#
import sys
import os
import logging
# optparse deprecated from Python 2.7 on
from optparse import OptionParser
import subprocess
import tempfile
import shutil
# invocation of ipython on exceptions
if False:
import sys, pdb
from IPython.core import ultratb
sys.excepthook = ultratb.FormattedTB(
mode='Verbose', color_scheme='Linux', call_pdb=1)
#--- third-party imports
#
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
#--- project specific imports
#
# /
__author__ = "Andreas Wilm"
__version__ = "0.1"
__email__ = "[email protected]"
__copyright__ = ""
__license__ = ""
__credits__ = [""]
__status__ = ""
sys.stderr.write("DEPRECATION WARNING: use https://github.com/andreas-wilm/simple-contig-joiner\n")
# http://docs.python.org/library/logging.html
LOG = logging.getLogger("")
logging.basicConfig(level=logging.INFO,
format='%(levelname)s [%(asctime)s]: %(message)s')
def cmdline_parser():
"""
creates an OptionParser instance
"""
# http://docs.python.org/library/optparse.html
usage = "%prog: " + __doc__ + "\n" \
"usage: %prog [options]"
parser = OptionParser(usage=usage)
parser.add_option("-v", "--verbose",
action="store_true",
dest="verbose",
help="be verbose")
parser.add_option("", "--debug",
action="store_true",
dest="debug",
help="debugging")
parser.add_option("-c", "--c",
dest="fcontigs",
help="Input file containing contigs to join (fasta)")
parser.add_option("-n", "--dont-fill-with-ref",
dest="dont_fill_with_ref",
action="store_true",
help="Don't fill gaps with reference, but keep Ns isntead")
parser.add_option("-r", "--ref",
dest="fref",
help="Input file containing reference sequence to fill gaps between contigs (fasta)")
parser.add_option("-o", "--output",
dest="fout",
help="output file (fasta)")
parser.add_option("", "--keep-tmp-files",
dest="keep_temp_files",
action="store_true",
default=False,
help="Optional: Don't delete temp (nucmer etc) files")
parser.add_option("", "--tmp-dir",
dest="tmp_dir", # type="string|int|float"
help="Optional: directory to save temp files in")
return parser
def main():
"""
The main function
"""
parser = cmdline_parser()
(opts, args) = parser.parse_args()
if len(args):
parser.error("Unrecognized arguments found: %s." % (
' '.join(args)))
sys.exit(1)
if opts.verbose:
LOG.setLevel(logging.INFO)
if opts.debug:
LOG.setLevel(logging.DEBUG)
if not opts.fref:
parser.error("Reference input file argument missing.")
sys.exit(1)
if not opts.fcontigs:
parser.error("Contig input file argument missing.")
sys.exit(1)
for fname in [opts.fref, opts.fcontigs]:
if not os.path.exists(fname):
LOG.fatal("file '%s' does not exist.\n" % fname)
sys.exit(1)
if not opts.fout:
parser.error("Fasta output file argument missing.")
sys.exit(1)
if os.path.exists(opts.fout):
LOG.fatal("Refusing to overwrite existing file '%s'." % opts.fout)
sys.exit(1)
tmp_files = []
# run mummer's nucmer
#
out_prefix = tempfile.NamedTemporaryFile(
delete=False, dir=opts.tmp_dir).name
fdelta = out_prefix + ".delta"
cmd_args = ['nucmer', opts.fref, opts.fcontigs,
'-p', out_prefix]
process = subprocess.Popen(cmd_args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
(p_stdout, p_stderr) = process.communicate()
assert os.path.exists(fdelta), (
"Couldn't find expected output ('%s') for command: '%s'" % (
fdelta, ' '.join(cmd_args)))
tmp_files.append(fdelta)
LOG.info("Delta written to %s" % fdelta)
# run mummer's show-tiling which creates a pseudo molecule from
# the contigs, filled with Ns
#
fpseudo = fdelta + "_pseudo.fa"
cmd_args = ['show-tiling', fdelta, '-p', fpseudo]
process = subprocess.Popen(cmd_args, stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
(p_stdout, p_stderr) = process.communicate()
assert os.path.exists(fpseudo), (
"Couldn't find expected output ('%s') for command: '%s'" % (
fpseudo, ' '.join(cmd_args)))
tmp_files.append(fpseudo)
LOG.info("Pseudo written to %s" % fpseudo)
if opts.dont_fill_with_ref:
LOG.info(
"Early exit as requested: keeping nucmer's stitched version using N's instead of ref and copying to %s" % (
opts.fout))
shutil.copyfile(fpseudo, opts.fout)
if not opts.keep_temp_files:
for f in tmp_files:
os.unlink(f)
sys.exit(0)
# concatenate the pseudo file and the reference
#
faln_in = fpseudo + "_plus_ref.fa"
fh = open(faln_in, 'w')
for f in [opts.fref, fpseudo]:
seqrecs = list(SeqIO.parse(f, "fasta"))
assert len(seqrecs)==1, (
"Expected exactly one sequence in '%s'" % f)
SeqIO.write(seqrecs, fh, "fasta")
fh.close()
tmp_files.append(faln_in)
# align pseudo file (gapped contigs) and reference
#
faln_out = fpseudo + "_plus_ref_aln.fa"
#cmd_args = ['muscle', '-maxiters', '1', '-in', faln_in, '-out', faln_out]
#process = subprocess.Popen(cmd_args, stdout=subprocess.PIPE,
# stderr=subprocess.PIPE)
cmd = 'mafft --auto --anysymbol %s > %s' % (faln_in, faln_out)
process = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE)
(p_stdout, p_stderr) = process.communicate()
assert os.path.exists(faln_out), (
"Couldn't find expected output ('%s') for command: '%s'" % (
faln_out, ' '.join(cmd_args)))
tmp_files.append(faln_out)
LOG.info("Alignment written to %s" % faln_out)
fh = open(faln_out, "r")
seqrecs = list(SeqIO.parse(fh, "fasta"))
#cleaned_stderr = ''.join(process.stderr.readlines()).replace("WARNING : Unknown character n\r", "")
fh.close()
assert len(seqrecs)==2, (
"Expected exactly two sequences in alignment %s but got %d." % (
faln_out, len(seqrecs)))
# now walk through the pairwise alignment. keep the pseudo contig
# and use the reference only in case of gaps (marked as Ns) or
# other ambiguity characters
#
stitched_seq = []
# both seqrecs are aligned
(ref_seqrec, pseudo_seqrec) = sorted(seqrecs,
key = lambda s: s.id.startswith("pseudo_"))
for pos in range(len(pseudo_seqrec.seq)):
pseudo_nt = pseudo_seqrec.seq[pos]
ref_nt = ref_seqrec.seq[pos]
# N: "real" N in contig or to be replaced with reference
# internal gap: deletion with respect to reference
# terminal gap: missing fragment
# delay gap handling therefore
# tilde in contigs will be kept as indel
if pseudo_nt == 'N' or pseudo_nt == "-":
stitched_seq.append(ref_nt)
elif pseudo_nt.upper() not in 'ACGTU~' and ref_nt.upper() in 'ACGTU':
stitched_seq.append(ref_nt)
else:
stitched_seq.append(pseudo_nt)
# delayed: gap handling: substitute terminal ones with refseq and
# replace internal ones
stitched_seq = ''.join(stitched_seq)
tmp_len = len(stitched_seq)
stitched_seq = stitched_seq.lstrip("-")
num_gaps_start = tmp_len - len(stitched_seq)
tmp_len = len(stitched_seq)
stitched_seq = stitched_seq.rstrip("-")
num_gaps_end = tmp_len - len(stitched_seq)
stitched_seq = "%s%s%s" % (ref_seqrec.seq[:num_gaps_start],
stitched_seq,
ref_seqrec.seq[:-num_gaps_end])
stitched_seq = stitched_seq.replace("-", "")
stitched_seq = stitched_seq.replace("~", "")
# output
#
out_seqrec = SeqRecord(Seq(stitched_seq, IUPAC.ambiguous_dna),
id="stitched_seq",
description="Mosaic of %s and contigs" % (ref_seqrec.id))
SeqIO.write(out_seqrec, opts.fout, "fasta")
LOG.info("Stitched/Mosaic sequence with fake id %s written to %s" % (
out_seqrec.id, opts.fout))
if not opts.keep_temp_files:
for f in tmp_files:
os.unlink(f)
if __name__ == "__main__":
main()
LOG.info("Successful program exit")
LOG.warn("Best to check the output through alignment")
LOG.warn("Tilde in contigs are kept as insertions")