forked from BDI-pathogens/phyloscanner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphyloscanner_make_trees.py
executable file
·2119 lines (1922 loc) · 91.5 KB
/
phyloscanner_make_trees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
from __future__ import print_function
## Author: Chris Wymant, [email protected]
## Acknowledgement: I wrote this while funded by ERC Advanced Grant PBDR-339251
##
## Overview:
ExplanatoryMessage = '''phyloscanner_make_trees.py infers phylogenies containing
both within- and between-host pathogen genetic diversity in windows along the
genome, using mapped reads as input. For each window, for each sample, all reads
mapped to that window are found and identical reads are collected together with
an associated count. Then all reads from all samples in each window are aligned
using MAFFT and a phylogeny is constructed using RAxML. Temporary files and
output files are written to the current working directory; to avoid overwriting
existing files, you should run phyloscanner_make_trees.py from inside an empty
directory.
More information on running phyloscanner can be found in the manual. Options are
explained below; for some options we go into greater detail in the manual,
trying to be as concise as possible here.'''
################################################################################
# The names of some files we'll create.
FileForAlignedRefs = 'RefsAln.fasta'
# Some temporary working files we'll create
TempFileForRefs = 'temp_refs.fasta'
TempFileForPairwiseUnalignedRefs = 'temp_2Refs.fasta'
TempFileForPairwiseAlignedRefs = 'temp_2RefsAln.fasta'
TempFileForReads_basename = 'temp_UnalignedReads'
TempFileForOtherRefs_basename = 'temp_OtherRefs'
TempFileForAllBootstrappedTrees_basename = 'temp_AllBootstrappedTrees'
################################################################################
GapChar = '-'
import os
import collections
import itertools
import subprocess
import sys
import re
import copy
import shutil
import glob
import time
import argparse
import pysam
from Bio import SeqIO
from Bio import Seq
from Bio import AlignIO
from Bio import Align
import tools.phyloscanner_funcs as pf
# Define a function to check files exist, as a type for the argparse.
def File(MyFile):
if not os.path.isfile(MyFile):
raise argparse.ArgumentTypeError(MyFile+' does not exist or is not a file.')
return MyFile
# Define a function to check directories exist, as a type for the argparse.
def Dir(MyDir):
if not os.path.isdir(MyDir):
raise argparse.ArgumentTypeError(MyDir + \
' does not exist or is not a directory.')
return MyDir
# Define a comma-separated integers object, as a type for the argparse.
def CommaSeparatedInts(MyCommaSeparatedInts):
try:
ListOfInts = [int(value) for value in MyCommaSeparatedInts.split(',')]
except:
raise argparse.ArgumentTypeError('Unable to understand ' +\
MyCommaSeparatedInts + ' as comma-separated integers.')
else:
return ListOfInts
# A class to have new lines in argument help text
class SmartFormatter(argparse.HelpFormatter):
def _split_lines(self, text, width):
if text.startswith('R|'):
return text[2:].splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
# Set up the arguments for this script
parser = argparse.ArgumentParser(description=ExplanatoryMessage,
formatter_class=SmartFormatter)
# Positional args
parser.add_argument('BamAndRefList', type=File,
help='''R|A csv-format file listing the bam and reference files
(i.e. the fasta-format file containing the sequence to
which the reads were mapped). The first column should
be the bam file, the second column the corresponding
reference file, with a comma separating the two. An
optional third column, if present, will be used to
rename the bam files in all output. For example:
PatientA.bam,PatientA_ref.fasta,A
PatientB.bam,PatientB_ref.fasta,B''')
parser.add_argument('-Q', '--quiet', action='store_true', help='''Turns off the
small amount of information printed to the terminal (via stdout). We'll still
print warnings and errors (via stderr), and the command you ran, for logging
purposes.''')
parser.add_argument('-V', '--verbose', action='store_true', help='''Print a
little more information than usual. The --time option also provides extra
information on progress.''')
WindowArgs = parser.add_argument_group('Window options - you must choose'
' exactly one of -W, -AW or -E')
WindowArgs.add_argument('-W', '--windows', type=CommaSeparatedInts,
help='''Used to specify a comma-separated series of paired coordinates defining
the boundaries of the windows. e.g. 1,300,301,600,601,900 would define windows
1-300, 301-600, 601-900.''')
WindowArgs.add_argument('-AW', '--auto-window-params',
type=CommaSeparatedInts, help='''Used to specify 2, 3 or 4 comma-separated
integers controlling the automatic creation of regular windows. The first
integer is the width you want windows to be. (If you are not using the
--pairwise-align-to option, columns in the alignment of all references are
weighted by their non-gap fraction; see the manual for clarification.) The
second integer is the overlap between the end of one window and the start of the
next. The third integer, if specified, is the start position for the first
window (if not specified this is 1). The fourth integer, if specified, is the
end position for the last window (if not specified, windows will continue up to
the end of the reference or references).''')
WindowArgs.add_argument('-E', '--explore-window-widths',
type=CommaSeparatedInts, help='''Use this option to explore how the number of
unique reads found in each bam file in each window, all along the genome,
depends on the window width. After this option specify a comma-separated list of
integers. The first integer is the starting position for stepping along the
genome. Subsequent integers are window widths to try. Output is written to the
file specified with the --explore-window-width-file option.''')
WindowArgs.add_argument('-EF', '--explore-window-width-file', help='Used to '
'specify an output file for window width data, when the '
'--explore-window-widths option is used. Output is in in csv format.')
RecommendedArgs = parser.add_argument_group('Options we particularly recommend')
RecommendedArgs.add_argument('-A', '--alignment-of-other-refs', type=File,
help='''Used to specify an alignment of reference sequences for inclusion with
the reads, for comparison. (which need not be
those used to produce the bam files) that will be cut into the same windows as
the bam files and included in the alignment of reads, for comparison. This is
required if phyloscanner is to analyse the trees it produces.''')
RecommendedArgs.add_argument('-2', '--pairwise-align-to', help='''By default,
phyloscanner figures out where corresponding windows are in different bam files
by creating a multiple sequence alignment containing all of the mapping
references used to create the bam files (plus any extra references included with
-A), and window coordinates are intepreted with respect to this alignment.
However using this option, the mapping references used to create the bam files
are each separately pairwise aligned to one of the extra references included
with -A, and window coordinates are interpreted with respect to this
reference. Specify the name of the reference to use after this option.''')
RAxMLdefaultOptions = "-m GTRCAT -p 1 --no-seq-check"
RaxmlHelp ='''Use this option to specify how RAxML is to be run, including
both the executable (with the path to it if needed), and the options. If you do
not specify anything, we will try to find the fastest RAxML exectuable available
(assuming its path is in your PATH environment variable) and use the
options''' + RAxMLdefaultOptions + '''. -m tells RAxML which evolutionary model
to use, and -p specifies a random number seed for the parsimony inferences; both
are compulsory. You may include any other RAxML options in this command. The set
of things you specify with --x-raxml need to be surrounded with one pair of
quotation marks (so that they're kept together as one option for phyloscanner
and only split up for raxml). If you include a path to your raxml binary, it may
not include whitespace, since whitespace is interpreted as separating raxml
options. Do not include options relating to bootstraps: use phyloscanner's
--num-bootstraps and --bootstrap-seed options instead. Do not include options
relating to the naming of files.'''
RecommendedArgs.add_argument('--x-raxml', help=RaxmlHelp)
RecommendedArgs.add_argument('-P', '--merge-paired-reads', action='store_true',
help='''Relevant only for paired-read data for which the mates in a pair
(sometimes) overlap with each other: merge overlapping mates into a single
(longer) read.''')
RecommendedArgs.add_argument('-CR', '--check-recombination',
action='store_true', help='''Calculates a metric of recombination for each
sample's set of reads in each window. (Recommended only if you're interested, of
course.) Calculation time scales cubically with the number of unique sequences
each sample has per window, and so is turned off by default.''')
QualityArgs = parser.add_argument_group('Options intended to mitigate the '
'impact of poor quality reads')
QualityArgs.add_argument('-I', '--discard-improper-pairs', action='store_true',
help='''For paired-read data, discard all reads that are flagged as improperly
paired.''')
QualityArgs.add_argument('-Q1', '--quality-trim-ends', type=int, help='''Used to
specify a quality threshold for trimming the ends of reads. We trim each read
inwards until a base of this quality is met.''')
QualityArgs.add_argument('-Q2', '--min-internal-quality', type=int, help=\
'''Used to specify an interal quality threshold for reads. Reads are allowed at
most one base below this quality; any read with two or more bases below this
quality are discarded.''')
QualityArgs.add_argument('-MC', '--min-read-count', type=int, default=1, help=\
'''Used to specify a minimum count for each unique read.
Reads with a count (i.e. the number of times that sequence was observed,
after merging if merging is being done) less than this value are discarded.
The default value of 1 means all reads are kept.''')
OtherArgs = parser.add_argument_group('Other assorted options')
OtherArgs.add_argument('-XC', '--excision-coords', type=CommaSeparatedInts,
help='Used to specify a comma-separated set of integer coordinates that will '
'be excised from the aligned reads. Requires the -XR flag.')
OtherArgs.add_argument('-XR', '--excision-ref', help='''Used to specify the name
of a reference (which must be present in the file you specify with -A) with
respect to which the coordinates specified with -XC are interpreted. If you are
also using the --pairwise-align-to option, you must use the same reference there
and here.''')
OtherArgs.add_argument('-MTA', '--merging-threshold-a', type=int, default=0,
help='''Used to specify a similarity threshold for merging similar reads: reads
that differ by a number of bases equal to or less than this are merged, those
with higher counts effectively absorbing those with lower counts. The default
value of 0 means there is no merging. In this version of the merging algorithm,
if B is similar enough to merge into C, and A is similar enough to merge into B
but not into C (A -> B -> C), B will be merged into C and A will be kept
separate. Contrast with --merging-threshold-b.''')
OtherArgs.add_argument('-MTB', '--merging-threshold-b', type=int, default=0,
help='''Similar to --merging-threshold-a, except that in this version of the
merging algorithm, if B is similar enough to merge into C, and A is similar
enough to merge into B but not into C (A -> B -> C), both A and B will be merged
into C.''')
OtherArgs.add_argument('-N', '--num-bootstraps', type=int,
help='Used to specify the number of bootstraps to be calculated for RAxML trees'
' (by default, none, i.e. only the ML tree is calculated).')
OtherArgs.add_argument('-Ns', '--bootstrap-seed', type=int, default=1, help='''
Used to specify the random-number seed for running RAxML with bootstraps. The
default is 1.''')
OtherArgs.add_argument('-OD', '--output-dir', help='''Used to specify the name
of a directory into which output files will be moved.''')
OtherArgs.add_argument('--time', action='store_true',
help='Print the times taken by different steps.')
OtherArgs.add_argument('--x-mafft', default='mafft', help=\
'Used to specify the command required to run mafft (by default: mafft).')
OtherArgs.add_argument('--x-samtools', default='samtools', help=\
'Used to specify the command required to run samtools, if needed (by default: '
'samtools).')
OtherArgs.add_argument('-KO', '--keep-output-together', action='store_true',
help='''By default, subdirectories are made for different kinds of phyloscanner
output. With this option, all output files will be in the same directory (either
the working directory, or whatever you specify with --output-dir).''')
OtherArgs.add_argument('-KT', '--keep-temp-files', action='store_true', help='''
Keep temporary files we create on the way (these are deleted by default).''')
BioinformaticsArgs = parser.add_argument_group('Options for detailed'
' bioinformatic interrogation of the input bam files (not intended for normal'
' usage)')
BioinformaticsArgs.add_argument('-IO', '--inspect-disagreeing-overlaps',
action='store_true', help='''With --merge-paired-reads, those pairs that
overlap but disagree are discarded. With this option, these discarded pairs
are written to a bam file (one per patient, with their reference file copied
to the working directory) for your inspection.''')
BioinformaticsArgs.add_argument('-RN1', '--read-names-1', action='store_true',
help='''Produce a file for each window and each bam, listing the names of the
reads that phyloscanner used.''')
BioinformaticsArgs.add_argument('-RN2', '--read-names-2', action='store_true',
help='''As --read-names-1, except the files will show the correspondence between
read names and which unique sequence they correspond to.''')
BioinformaticsArgs.add_argument('--exact-window-start', action='store_true',
help='''Normally phyloscanner retrieves all reads that fully
overlap a given window, i.e. starting at or anywhere before the window start,
and ending at or anywhere after the window end. If this option is used without
--exact-window-end, the reads that are retrieved are those that start at exactly
the start of the window, and end anywhere (ignoring all the window end
coordinates specified). If this option is used with --exact-window-end, for a
read to be kept it must start at exactly the window start AND end at exactly the
window end.''')
BioinformaticsArgs.add_argument('--exact-window-end', action='store_true',
help='''With this option, the reads that are retrieved are those that end at
exactly the end of the window. Read the --exact-window-start help.''')
BioinformaticsArgs.add_argument('-CE', '--recover-clipped-ends',
action='store_true', help ='''This option recovers (soft-)clipped read ends by
setting the bases they contain to be mapped, assuming no indels inside the
clipped end. WARNING: mapping software clips the ends of reads for a reason.''')
StopEarlyArgs = parser.add_argument_group('Options to only partially run '
'phyloscanner, stopping early or skipping steps')
StopEarlyArgs.add_argument('-AO', '--align-refs-only', action='store_true',
help='''Align the mapping references used to create the bam files, plus any
extra reference sequences specified with -A, then quit without doing anything
else.''')
StopEarlyArgs.add_argument('-RNO', '--read-names-only', action='store_true',
help='''To be combined with --read-names-1 or --read-names-2: quit after writing
the read names to a file (which means the reads are not aligned).''')
StopEarlyArgs.add_argument('-T', '--no-trees', action='store_true',
help='Process and align the reads from each window, then quit without making '
'trees.')
StopEarlyArgs.add_argument('-D', '--dont-check-duplicates', action='store_true',
help="Don't compare reads between samples to find duplicates - a possible "+\
"indication of contamination. (By default this check is done.)")
DeprecatedArgs = parser.add_argument_group('''Deprecated options, left here for
backward compatability or interest.''')
DeprecatedArgs.add_argument('-C', '--contaminant-count-ratio', type=float,
help='''Used to specify a numerical value which is interpreted in the following
'way: if a sequence is found exactly duplicated between any two bam files,
'and is more common in one than the other by a factor at least equal to this
'value, the rarer sequence is deleted and goes instead into a separate
contaminant read fasta file. (This is considered deprecated because including the
exact duplicates in the tree and dealing with them during tree analysis is more
flexible.)''')
DeprecatedArgs.add_argument('-CO', '--flag-contaminants-only',
action='store_true',
help="For each window, just flag contaminant reads then move on (without "
"aligning reads or making a tree). Only makes sense with the -C flag.")
#DeprecatedArgs.add_argument('-CD', '--contaminant-read-dir', type=Dir,
#help='''A directory containing the contaminant read fasta files produced by a
#previous run of phyloscanner using the the -C flag:
#reads flagged as contaminants there will be considered contaminants in this run
#too. The windows must exactly match up between these two runs, most easily
#achieved with the -2 option. The point of this option is to first do a run with
#every bam file you have in which there could conceivably be cross-contamination,
#using the -C flag (and possibly -CO to save time), and then in subsequent runs
#focussing on subsets of bam files you will be able to identify contamination
#from outside that subset.''')
DeprecatedArgs.add_argument('-FR', '--forbid-read-repeats', action='store_true',
help='''Using this option, if a read with the same name is found
to span each of a series of consecutive, overlapping windows, it is only used in
the first window.''')
DeprecatedArgs.add_argument('-O', '--keep-overhangs', action='store_true',
help='''Keep the part of the read that overhangs the edge of the window. (By
default this is trimmed, i.e. only the part of the read inside the window is
kept.)''')
DeprecatedArgs.add_argument('-RC', '--ref-for-coords', help='''By default,
window coordinates are interpreted as alignment coordinates, in the multiple
sequence alignment of all references (which phyloscanner creates). With this
option, window coordinates are interpreted with respect to a named reference in
that alignment. Specify the name of the desired reference after this option.
This is deprecated because the --pairwise-align-to option is expected to perform
better.''')
DeprecatedArgs.add_argument('-RG', '--recombination-gap-aware',
action='store_true',
help='''By default, when calculating Hamming distances for the recombination
metric, positions with gaps are ignored. With this option, the gap character
counts as a fifth base.''')
DeprecatedArgs.add_argument('-RD', '--recombination-norm-diversity',
action='store_true', help='''By default, the normalising constant for the
recombination metric is half the number of sites in the window; with this option
it's half the number of sites in the window that are polymorphic for that bam
file.''')
args = parser.parse_args()
# Shorthand
WindowCoords = args.windows
UserSpecifiedCoords = args.windows != None
AutoWindows = args.auto_window_params != None
IncludeOtherRefs = args.alignment_of_other_refs != None
QualTrimEnds = args.quality_trim_ends != None
ImposeMinQual = args.min_internal_quality != None
ExcisePositions = args.excision_coords != None
PairwiseAlign = args.pairwise_align_to != None
FlagContaminants = args.contaminant_count_ratio != None
#RecallContaminants = args.contaminant_read_dir != None
RecallContaminants = False
CheckDuplicates = not args.dont_check_duplicates
ExploreWindowWidths = args.explore_window_widths != None
MergeReadsA = args.merging_threshold_a > 0
MergeReadsB = args.merging_threshold_b > 0
MergeReads = MergeReadsA or MergeReadsB
PrintInfo = not args.quiet
RecombNormToDiv = args.recombination_norm_diversity
# Print how this script was called, for logging purposes.
print('phyloscanner was called thus:\n' + ' '.join(sys.argv))
# Warn if RAxML files exist already.
if glob.glob('RAxML*'):
print('Warning: RAxML files are present in the working directory. If their',
'names clash with those that phyloscanner will try to create, RAxML will',
'fail to run. Continuing.', file=sys.stderr)
TempFiles = set([])
# Try to make the output dir, if desired.
HaveMadeOutputDir = False
if args.output_dir != None:
if os.path.isdir(args.output_dir):
HaveMadeOutputDir = True
else:
try:
os.mkdir(args.output_dir)
except:
print('Problem creating the directory', args.output_dir+\
". Is this a subdirectory inside a directory that doesn't exist? (We can",
"only create one new directory at a time - not a new directory inside",
"another new one.) Continuing.", file=sys.stderr)
else:
HaveMadeOutputDir = True
# Check only one merging type is specified. Thereafter use its threshold.
if MergeReadsA and MergeReadsB:
print('You cannot specify both --merging-threshold-a and',
'--merging-threshold-b. Quitting.', file=sys.stderr)
exit(1)
if MergeReadsA:
MergingThreshold = args.merging_threshold_a
elif MergeReadsB:
MergingThreshold = args.merging_threshold_b
# Check that window coords have been specified either manually or automatically,
# or we're exploring window widths
NumWindowOptions = len([Bool for Bool in [UserSpecifiedCoords, AutoWindows,
ExploreWindowWidths] if Bool == True])
if NumWindowOptions != 1:
print('Exactly one of the --windows, --auto-window-params,',
'--explore-window-widths options should specified. Quitting.',
file=sys.stderr)
exit(1)
# If using automatic windows (i.e. not specifying any coordinates), the user
# should not specify a reference for their coords to be interpreted with respect
# to, nor use a directory of contaminant reads (since windows must match to make
# use of contaminant reads).
if AutoWindows and args.ref_for_coords != None:
print('The --ref-for-coords and --auto-window-params',
'options should not be specified together: the first means your',
'coordinates should be interpreted with respect to a named reference, and',
"the second means you're not specfiying any coordinates. Quitting.",
file=sys.stderr)
exit(1)
if RecallContaminants and (not UserSpecifiedCoords):
print('If using the --contaminant-read-dir option you must also specify',
'windows with the --windows option, because the former requires that the',
'windows in the current run exactly match up with those from the run that',
'produces your directory of contaminant reads. Quitting.', file=sys.stderr)
exit(1)
# If coords were specified with respect to one particular reference,
# WindowCoords will be reassigned to be the translation of those coords to
# alignment coordinates. UserCoords are the original coords, which we use for
# labelling things to keep labels intuitive for the user.
UserCoords = WindowCoords
# Find contaminant read files and link them to their windows.
if RecallContaminants:
ContaminantFilesByWindow = {}
ContaminantFileRegex = FileForDuplicateSeqs_basename + \
'InWindow_(\d+)_to_(\d+)\.fasta'
ContaminantFileRegex2 = re.compile(ContaminantFileRegex)
LeftWindowEdges = UserCoords[::2]
RightWindowEdges = UserCoords[1::2]
PairedWindowCoords = zip(LeftWindowEdges, RightWindowEdges)
for AnyFile in os.listdir(args.contaminant_read_dir):
if ContaminantFileRegex2.match(AnyFile):
(LeftEdge, RightEdge) = ContaminantFileRegex2.match(AnyFile).groups()
(LeftEdge, RightEdge) = (int(LeftEdge), int(RightEdge))
if (LeftEdge, RightEdge) in PairedWindowCoords:
ContaminantFilesByWindow[(LeftEdge, RightEdge)] = \
os.path.join(args.contaminant_read_dir, AnyFile)
if len(ContaminantFilesByWindow) == 0:
print('Failed to find any files matching the regex', ContaminantFileRegex,
'in', args.contaminant_read_dir + '. Quitting.', file=sys.stderr)
exit(1)
# Check the contamination ratio is >= 1
if FlagContaminants and args.contaminant_count_ratio < 1:
print('The value specified with --contaminant-count-ratio must be greater',
'than 1. (It is the ratio of the more common duplicate to the less common',
'one at which we consider the less common one to be contamination; it',
"should probably be quite a lot larger than 1.) Quitting.", file=sys.stderr)
exit(1)
# Flagging contaminants requires that we check for duplicates
if args.dont_check_duplicates and FlagContaminants:
print('The --dont-check-duplicates and --contaminant-count-ratio options',
'cannot be used together: flagging contaminants requires that we check',
'duplicates. Quitting.', file=sys.stderr)
exit(1)
# The -XR and -XC flags should be used together or not all.
if (ExcisePositions and args.excision_ref == None) or \
((not ExcisePositions) and args.excision_ref != None):
print('The --excision-coords and --excision-ref options require each other:',
'use both, or neither. Quitting.', file=sys.stderr)
exit(1)
# --read-names-2 can't be used with read merging or position excising
if args.read_names_2 and (ExcisePositions or MergeReads):
print('The --read-names-2 option cannot be used with --merging-threshold-a,',
'--merging-threshold-b or --excision-coords, because they change the',
'correspondence initially established between unique sequences and reads.',
'Quitting''', file=sys.stderr)
exit(1)
# Sanity checks on using the pairwise alignment option.
if PairwiseAlign:
if args.ref_for_coords != None:
print('Note that if the --pairwise-align-to option is used, using the',
'--ref-for-coords as well is redundant.', file=sys.stderr)
if args.ref_for_coords != args.pairwise_align_to:
print('Furthermore you have chosen two different values for these flags,'
, 'indicating some confusion as to their use. Try again.')
exit(1)
if ExcisePositions and args.excision_ref != args.pairwise_align_to:
print('The --pairwise-align-to and --excision-ref options can only be',
'used at once if the same reference is specified for both. Qutting.',
file=sys.stderr)
exit(1)
def CheckMaxCoord(coords, ref):
'''Check that no coordinate is after the end of the reference with respect to
which it is supposed to be interpreted.'''
if max(coords) > len(ref.seq.ungap("-")):
print('You have specified at least one coordinate (', max(coords),
') that is larger than the length of the reference with respect to which',
' those coordinates are to be interpreted - ', ref.id, '. Quitting.',
sep='', file=sys.stderr)
exit(1)
def SanityCheckWindowCoords(WindowCoords):
'Check window coordinates come in pairs, all positive, the right > the left.'
NumCoords = len(WindowCoords)
if NumCoords % 2 != 0:
raise ValueError('An even number of WindowCoords must be specified. '+\
'Quitting.')
if any(coord < 1 for coord in WindowCoords):
raise ValueError('All WindowCoords must be greater than zero. Quitting.')
LeftWindowEdges = WindowCoords[::2]
RightWindowEdges = WindowCoords[1::2]
PairedWindowCoords = zip(LeftWindowEdges, RightWindowEdges)
for LeftWindowEdge, RightWindowEdge in PairedWindowCoords:
if LeftWindowEdge >= RightWindowEdge:
raise ValueError('You specified a window as having left edge ' +\
str(LeftWindowEdge) +' and right edge ' +str(RightWindowEdge)+\
'. Left edges should be less than their right edges. Quitting.')
exit(1)
return NumCoords
# Sanity checks on user specified WindowCoords
if UserSpecifiedCoords:
NumCoords = SanityCheckWindowCoords(WindowCoords)
# Sanity checks on auto window parameters
if AutoWindows:
NumAutoWindowParams = len(args.auto_window_params)
if not NumAutoWindowParams in [2,3,4]:
print('The --auto-window-params option requires 2, 3 or 4 integers.',
'Quitting.', file=sys.stderr)
exit(1)
WeightedWindowWidth = args.auto_window_params[0]
WindowOverlap = args.auto_window_params[1]
if NumAutoWindowParams > 2:
WindowStartPos = args.auto_window_params[2]
if WindowStartPos < 1:
print('The start position for the --auto-window-params option must be',
'greater than zero. Quitting.', file=sys.stderr)
exit(1)
if NumAutoWindowParams == 4:
WindowEndPos = args.auto_window_params[3]
else:
WindowEndPos = float('inf')
else:
WindowStartPos = 1
WindowEndPos = float('inf')
if WeightedWindowWidth <= 0:
print('The weighted window width for the --auto-window-params option must',
'be greater than zero. Quitting.', file=sys.stderr)
exit(1)
# Sanity checks on window-width exploration parameters
if ExploreWindowWidths:
if args.explore_window_width_file == None:
print('The --explore-window-widths option requires the',
'--explore-window-width-file option. Quitting.', file=sys.stderr)
exit(1)
try:
with open(args.explore_window_width_file, 'w') as f:
pass
except:
print('Unable to open', args.explore_window_width_file, 'for writing. (Is',
"it a file inside a directory that doesn't exist?). Quitting.",
file=sys.stderr)
raise
if len(args.explore_window_widths) < 2:
print('The --explore-window-widths option should be used to specify at',
'least two parameters; use the --help option for more information.',
'Quitting.', file=sys.stderr)
exit(1)
ExploreStart = args.explore_window_widths[0]
ExploreWidths = args.explore_window_widths[1:]
if ExploreStart < 1:
print('The start point for windows when exploring window widths (the '+\
'first integer specified with --explore-window-widths) cannot be less '+\
'than 1. Quitting.', file=sys.stderr)
exit(1)
ExploreWidths = sorted(ExploreWidths)
MinExploreWidth = ExploreWidths[0]
if MinExploreWidth < 2:
print('The minimum window width specified with --explore-window-widths',
'should be greater than 1. Quitting.', file=sys.stderr)
exit(1)
MaxExploreWidth = ExploreWidths[-1]
WindowWidthExplorationData = []
CheckDuplicates = False
def FindExploratoryWindows(EndPoint):
'''Returns the set of coordinates needed to step across the genome with the
desired start, end and window width.'''
# The EndPoint argument should be:
# * the ref length if --ref-for-coords or --pairwise-align-to is used
# * the length of the mapping ref if there's only one bam and no extra refs
# * otherwise, the length of the alignment of all refs
if EndPoint < ExploreStart + MaxExploreWidth:
print('With the --explore-window-widths option you specified a start',
'point of', ExploreStart, 'and your largest window width was',
str(MaxExploreWidth) + '; one or both of these values should be',
'decreased since the length of the reference or alignment of references',
'with respect to which we are interpreting coordinates is only',
str(EndPoint) + '. We need to be able to fit at least one window in',
'between the start and end. Quitting.', file=sys.stderr)
exit(1)
ExploratoryCoords = []
for width in ExploreWidths:
NextStart = ExploreStart
NextEnd = ExploreStart + width - 1
while NextEnd <= EndPoint:
ExploratoryCoords += [NextStart, NextEnd]
NextStart += width
NextEnd += width
return ExploratoryCoords
# Record the names of any external refs being included.
# If we're doing pairwise alignments, we'll also need gappy and gapless copies
# of the ref chosen for pairwise alignment.
# Check that any coordinates that are to be interpreted with respect to a named
# reference do not go past the end of that reference.
ExternalRefNames = []
if IncludeOtherRefs:
try:
ExternalRefAlignment = AlignIO.read(args.alignment_of_other_refs, "fasta")
except:
print('Problem reading', args.alignment_of_other_refs + ':',
file=sys.stderr)
raise
for ref in ExternalRefAlignment:
ExternalRefNames.append(ref.id)
if ref.id == args.pairwise_align_to:
RefForPairwiseAlnsGappySeq = str(ref.seq)
RefForPairwiseAlns = copy.deepcopy(ref)
RefForPairwiseAlns.seq = RefForPairwiseAlns.seq.ungap("-")
RefForPairwiseAlnsLength = len(RefForPairwiseAlns.seq)
if UserSpecifiedCoords:
CheckMaxCoord(WindowCoords, ref)
elif ExploreWindowWidths:
MaxCoordForWindowWidthTesting = RefForPairwiseAlnsLength
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
if ref.id == args.ref_for_coords:
if UserSpecifiedCoords:
CheckMaxCoord(WindowCoords, ref)
elif ExploreWindowWidths:
MaxCoordForWindowWidthTesting = len(ref.seq.ungap("-"))
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
if ref.id == args.excision_ref:
CheckMaxCoord(args.excision_coords, ref)
# Consistency checks on flags that require a ref.
for FlagName, FlagValue in (('--ref-for-coords', args.ref_for_coords),
('--pairwise-align-to', args.pairwise_align_to),
('--excision-ref', args.excision_ref)):
#('--ref-for-rooting', args.ref_for_rooting),
if FlagValue == None:
continue
if not IncludeOtherRefs:
print('The', FlagName, 'flag requires the --alignment-of-other-refs',
'flag. Quitting.', file=sys.stderr)
exit(1)
if not FlagValue in ExternalRefNames:
print('Reference', FlagValue +', specified with the', FlagName,
'flag, was not found in', args.alignment_of_other_refs +'. Quitting.',
file=sys.stderr)
exit(1)
# Remove duplicated excision coords. Sort from largest to smallest.
if ExcisePositions:
args.excision_coords = list(set(args.excision_coords))
args.excision_coords = sorted(args.excision_coords, reverse=True)
TranslateCoordsCode = pf.FindAndCheckCode('TranslateCoords.py')
FindSeqsInFastaCode = pf.FindAndCheckCode('FindSeqsInFasta.py')
FindWindowsCode = pf.FindAndCheckCode('FindInformativeWindowsInFasta.py')
# Test RAxML works
if not args.no_trees:
RAxMLargList = pf.TestRAxML(args.x_raxml, RAxMLdefaultOptions, RaxmlHelp)
times = []
if args.time:
times.append(time.time())
# Read in the input bam and ref files
BamFiles, RefFiles, BamAliases, BamFileBasenames = \
pf.ReadInputCSVfile(args.BamAndRefList)
NumberOfBams = len(BamFiles)
# Don't produce duplication files if there's only one bam.
if NumberOfBams == 1:
CheckDuplicates = False
# Read in all the reference sequences. Set each seq name to be the corresponding
# alias.
RefSeqs = []
for i,RefFile in enumerate(RefFiles):
SeqList = list(SeqIO.parse(open(RefFile),'fasta'))
if len(SeqList) != 1:
print('There are', len(SeqList), 'sequences in', RefFile+'. There should',
'be exactly 1.\nQuitting.', file=sys.stderr)
exit(1)
SeqList[0].id = BamAliases[i]
RefSeqs += SeqList
def TranslateCoords(CodeArgs):
'''Runs TranslateCoordsCode with the supplied args, and returns the results as
a dict.'''
try:
CoordsString = subprocess.check_output([TranslateCoordsCode]+CodeArgs)
except:
print('Problem executing', TranslateCoordsCode +'. Quitting.',
file=sys.stderr)
raise
CoordsDict = {}
for line in CoordsString.splitlines():
# Trim leading & trailing whitespace and skip blank lines
line = line.strip()
if line == '':
continue
# Each line in the output of the TranslateCoordsCode should be a sequence
# name then the coordinates.
fields = line.split()
if len(fields) != NumCoords +1:
print('Unexpected number of fields in line\n' +line +'\nin the output '+\
'of ' +TranslateCoordsCode+'\nQuitting.', file=sys.stderr)
exit(1)
SeqName = fields[0]
coords = fields[1:]
# Convert the coordinates to integers.
# Where an alignment coordinate is inside a deletion in a particular
# sequence, TranslateCoords.py returns an integer + 0.5 for the coordinate
# with respect to that sequence. Python won't convert such figures directly
# from string to int, but we can do so via a float intermediate. This rounds
# down, i.e. to the coordinate of the base immediately to the left of the
# deletion.
for i in range(len(coords)):
if coords[i] != 'NaN':
try:
coords[i] = int(coords[i])
except ValueError:
if '.5' in coords[i]:
coords[i] = int(float(coords[i]))
else:
print('Unable to understand the coordinate', coords[i],
'as an integer in line\n' +line +'\nin the output of '+\
TranslateCoordsCode+'\nQuitting.', file=sys.stderr)
exit(1)
CoordsDict[SeqName] = coords
return CoordsDict
def SimpleCoordsFromAutoParams(RefSeqLength):
WindowEndPosLocal = min(WindowEndPos, RefSeqLength)
if WindowEndPosLocal < WindowStartPos + WeightedWindowWidth:
print('With the --auto-window-params option you specified a start',
'point of', WindowStartPos, 'and your weighted window width was',
str(WeightedWindowWidth) + '; one or both of these values should be',
'decreased because the length of your reference or your',
'specified end point is only', str(WindowEndPosLocal) + '. We need to be',
'able to fit at least one window in between the start and end. Quitting.',
file=sys.stderr)
exit(1)
WindowCoords = []
NextStart = WindowStartPos
NextEnd = WindowStartPos + WeightedWindowWidth - 1
while NextEnd <= WindowEndPosLocal:
WindowCoords += [NextStart, NextEnd]
NextStart = NextEnd - WindowOverlap + 1
NextEnd = NextStart + WeightedWindowWidth - 1
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
return WindowCoords, UserCoords, NumCoords, WindowEndPosLocal
# If there is only one bam and no other refs, no coordinate translation
# is necessary - we use the coords as they are, though setting any after the end
# of the reference to be equal to the end of the reference.
if NumberOfBams == 1 and not IncludeOtherRefs:
if args.align_refs_only:
print('As you are supplying a single bam file and no external references,',
"the --align-refs-only option makes no sense - there's nothing to align.",
"Quitting.", file=sys.stderr)
exit(1)
RefSeqLength = len(RefSeqs[0])
if AutoWindows:
WindowCoords, UserCoords, NumCoords, WindowEndPos = \
SimpleCoordsFromAutoParams(RefSeqLength)
if ExploreWindowWidths:
MaxCoordForWindowWidthTesting = RefSeqLength
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
CoordsInRefs = {BamAliases[0] : WindowCoords}
# If there are at least two bam files, or if there is one but we're including
# other refs, we'll be aligning references and translating the user-specified
# coords with respect to each sequence, then storing those coords in a dict
# indexed by the ref's name.
else:
if args.verbose:
print('Now determining the correspondence between coordinates in different',
'bam files.')
# If we're separately and sequentially pairwise aligning our references to
# a chosen ref in order to determine window coordinates, do so now.
if PairwiseAlign:
# Get the coords if auto coords
if AutoWindows:
WindowCoords, UserCoords, NumCoords, WindowEndPos = \
SimpleCoordsFromAutoParams(RefForPairwiseAlnsLength)
# Find the coordinates with respect to the chosen ref, in the alignment of
# just the external refs - we'll need these later.
ExternalRefWindowCoords = \
pf.TranslateSeqCoordsToAlnCoords(RefForPairwiseAlnsGappySeq, WindowCoords)
CoordsInRefs = {}
for BamRefSeq in RefSeqs:
# Align
SeqIO.write([RefForPairwiseAlns,BamRefSeq], TempFileForPairwiseUnalignedRefs,
"fasta")
TempFiles.add(TempFileForPairwiseUnalignedRefs)
with open(TempFileForPairwiseAlignedRefs, 'w') as f:
try:
ExitStatus = subprocess.call([args.x_mafft, '--quiet',
'--preservecase', TempFileForPairwiseUnalignedRefs], stdout=f)
assert ExitStatus == 0
except:
print('Problem calling mafft. Quitting.', file=sys.stderr)
raise
TempFiles.add(TempFileForPairwiseAlignedRefs)
# Translate.
# The index names in the PairwiseCoordsDict, labelling the coords found by
# coord translation, should coincide with the two seqs we're considering.
PairwiseCoordsDict = TranslateCoords([TempFileForPairwiseAlignedRefs,
args.pairwise_align_to] + [str(coord) for coord in WindowCoords])
if set(PairwiseCoordsDict.keys()) != \
set([BamRefSeq.id,args.pairwise_align_to]):
print('Malfunction of phylotypes: mismatch between the sequences',
'found in the output of', TranslateCoordsCode, 'and the two names "' + \
BamRefSeq.id+'", "'+args.pairwise_align_to +'". Quitting.',
file=sys.stderr)
exit(1)
CoordsInRefs[BamRefSeq.id] = PairwiseCoordsDict[BamRefSeq.id]
# We're creating a global alignment of all references:
else:
# Put all the mapping reference sequences into one file. If an alignment of
# other references was supplied, add the mapping references to that
# alignment; if not, align the mapping references to each other.
SeqIO.write(RefSeqs, TempFileForRefs, "fasta")
TempFiles.add(TempFileForRefs)
if IncludeOtherRefs:
FinalMafftOptions = ['--add', TempFileForRefs, args.alignment_of_other_refs]
else:
FinalMafftOptions = [TempFileForRefs]
with open(FileForAlignedRefs, 'w') as f:
try:
ExitStatus = subprocess.call([args.x_mafft, '--quiet',
'--preservecase'] + FinalMafftOptions, stdout=f)
assert ExitStatus == 0
except:
print('Problem calling mafft. Quitting.', file=sys.stderr)
raise
if args.align_refs_only:
print('References aligned in', FileForAlignedRefs+ \
'. Quitting successfully.')
exit(0)
# If we're here and we're exploring window widths, we haven't defined the
# coordinates yet (because we haven't known the alignment length), unless
# --ref-for-coords was specified.
if ExploreWindowWidths and args.ref_for_coords == None:
for seq in SeqIO.parse(open(FileForAlignedRefs),'fasta'):
RefAlignmentLength = len(seq.seq)
break
MaxCoordForWindowWidthTesting = RefAlignmentLength
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
# If window coords were specified with respect to one particular reference,
# or if we are excising certain coords, translate to alignment coords.
if args.ref_for_coords != None or ExcisePositions:
for seq in SeqIO.parse(open(FileForAlignedRefs),'fasta'):
if seq.id == args.ref_for_coords:
WindowCoords = \
pf.TranslateSeqCoordsToAlnCoords(str(seq.seq), UserCoords)
if seq.id == args.excision_ref:
RefForExcisionGappySeq = str(seq.seq)
AlignmentExcisionCoords = pf.TranslateSeqCoordsToAlnCoords(
RefForExcisionGappySeq, args.excision_coords)
# Determine windows automatically if desired
if AutoWindows:
command = [FindWindowsCode, FileForAlignedRefs,
str(WeightedWindowWidth), str(WindowOverlap), '-S', str(WindowStartPos)]
if NumAutoWindowParams == 4:
command += ['-E', str(WindowEndPos)]
try:
WindowsString = subprocess.check_output(command)
except:
print('Problem executing', FindWindowsCode +'. Quitting.',
file=sys.stderr)
raise
try:
WindowCoords = [int(value) for value in WindowsString.split(',')]
assert len(WindowCoords) >= 2
except:
print('Unable to understand the', FindWindowsCode, 'output -',
WindowsString, '- as comma-separated integers. Quitting.',
file=sys.stderr)
raise
try:
NumCoords = SanityCheckWindowCoords(WindowCoords)
except ValueError:
print('Problematic output from ' +FindWindowsCode, file=sys.stderr)
raise
UserCoords = WindowCoords
# Translate alignment coordinates to reference coordinates
CoordsInRefs = TranslateCoords([FileForAlignedRefs, '-A']+\
[str(coord) for coord in WindowCoords])
# The index names in the CoordsInSeqs dicts, labelling the coords found by
# coord translation, should cooincide with all seqs we're considering (i.e.
# those in FileForAlignedRefs).
if set(CoordsInRefs.keys()) != set(BamAliases+ExternalRefNames):
print('Malfunction of phylotypes: mismatch between the sequences found',
'in the output of', TranslateCoordsCode, 'and those in',
FileForAlignedRefs +'. Quitting.', file=sys.stderr)
exit(1)
# Make index files for the bam files if needed.
pf.MakeBamIndices(BamFiles, args.x_samtools)
# Gather some data from each bam file
BamFileRefSeqNames = {}
BamFileRefLengths = {}
if args.verbose:
print('Now preparing the bam files for analysis.')
for i,BamFileName in enumerate(BamFiles):
BamFileBasename = BamFileBasenames[i]
BamAlias = BamAliases[i]
# Prep for pysam
try:
BamFile = pysam.AlignmentFile(BamFileName, "rb")
except AttributeError:
print('Error calling "pysam.AlignmentFile". The AlignmentFile attribute',
'was introduced in pysam version 0.8.1; are you using an older version',
'than that? You might be able to update by running\npip install pysam',
'--upgrade\nfrom the command line. Quitting.', file=sys.stderr)
exit(1)
except ValueError:
print('Error trying to read', BamFileName, 'as a bam file with pysam.',
'Quitting.', file=sys.stderr)
raise
# Find the reference in the bam file; there should only be one.
AllReferences = BamFile.references
if len(AllReferences) != 1:
print('Expected exactly one reference in', BamFileName+'; found',
str(len(AllReferences))+'.\nQuitting.', file=sys.stderr)
exit(1)
BamFileRefSeqNames[BamFileBasename] = AllReferences[0]
# Get the length of the reference.
AllReferenceLengths = BamFile.lengths
if len(AllReferenceLengths) != 1:
print('Pysam error: found one reference but', len(AllReferenceLengths),
'reference lengths.\nQuitting.', file=sys.stderr)
exit(1)
RefLength = AllReferenceLengths[0]
BamFileRefLengths[BamFileBasename] = RefLength
# When translating coordinates, -1 means before the sequence starts; 'NaN'
# means after it ends. These should be replaced by 1 and the reference length