forked from BDI-pathogens/phyloscanner
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathphyloscanner_clean_alignments.R
executable file
·199 lines (156 loc) · 12.4 KB
/
phyloscanner_clean_alignments.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/env Rscript
options("warn"=1)
suppressMessages(require(argparse, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(phyloscannerR, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(data.table, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(ape, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(phangorn, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(ggtree, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(phytools, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(network, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(scales, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(gtable, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(grid, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(gridExtra, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(kimisc, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(GGally, quietly=TRUE, warn.conflicts=FALSE))
suppressMessages(require(sna, quietly=TRUE, warn.conflicts=FALSE))
arg_parser <- ArgumentParser()
arg_parser$add_argument("tree", action="store", help="A string that begins the file names (including the path) of all input trees. e.g. path/to/RAxML_bestTree.InWindow_")
arg_parser$add_argument("alignment", action="store", help="A string that begins the file names (including the path) of the alignments. e.g. path/to/AlignedReadsInWindow_")
arg_parser$add_argument("outputString", action="store", help="A string that will be used to label all output files.")
arg_parser$add_argument("-og", "--outgroupName", action="store", help="The name of the tip in the phylogeny/phylogenies to be used as outgroup (if unspecified, trees will be assumed to be already rooted).")
arg_parser$add_argument("-m", "--multifurcationThreshold", help="If specified, short branches in the input tree will be collapsed to form multifurcating internal nodes. This is recommended; many phylogenetics packages output binary trees with short or zero-length branches indicating multifurcations. If a number, this number will be used as the threshold, with all branches strictly smaller collapsed. If 'g', it will be guessed from the branch lengths and the width of the genomic window (if appropriate). It is recommended that trees are examined by eye to check that they do appear to have multifurcations if 'g' is used.")
arg_parser$add_argument("-b", "--blacklist", action="store", help="A path and string that begins all the file names for pre-existing blacklist files.")
# General, bland options
arg_parser$add_argument("-od", "--outputDir", action="store", help="All output will be written to this directory. If absent, current working directory.")
arg_parser$add_argument("-v", "--verbose", action="store_true", default=FALSE, help="Talk about what the script is doing.")
arg_parser$add_argument("-x", "--tipRegex", action="store", default="^(.*)_read_([0-9]+)_count_([0-9]+)$", help="Regular expression identifying tips from the dataset. This expects up to three capture groups, for host ID, read ID, and read count (in that order). If the latter two groups are missing then read information will not be used. If absent, the default is '^(.*)_read_([0-9]+)_count_([0-9]+)$', which matches input from the phyloscanner pipeline where the host ID is the BAM file name.")
arg_parser$add_argument("-y", "--fileNameRegex", action="store", default="^\\D*([0-9]+)_to_([0-9]+)\\D*$", help="Regular expression identifying window coordinates. Two capture groups: start and end; if the latter is missing then the first group is a single numerical identifier for the window. If absent, input will be assumed to be from the phyloscanner pipeline, and the host ID will be the BAM file name.")
arg_parser$add_argument("-tfe", "--treeFileExtension", action="store", default="tree", help="The file extension for tree files (default tree).")
arg_parser$add_argument("-cfe", "--csvFileExtension", action="store", default="csv", help="The file extension for table files (default csv).")
arg_parser$add_argument("-sd", "--seed", action="store_true", help="Random number seed; used by the downsampling process, and also ties in some parsimony reconstructions can be broken randomly.")
arg_parser$add_argument("-ow", "--overwrite", action="store_true", help="Overwrite existing output files with the same names.")
# Normalisation options
arg_parser$add_argument("-nr", "--normRefFileName", action="store", help="Name of a file giving a normalisation constant for every genome position. Cannot be used simultaneously with -nc. If both -nr and -nc are absent then no normalisation will be performed.")
arg_parser$add_argument("-ns", "--normStandardiseGagPol", action="store_true", default=FALSE, help="An HIV-specific option: if true, the normalising constants are standardised so that the average on gag+pol equals 1. Otherwise they are standardised so the average on the whole genome equals 1.")
arg_parser$add_argument("-nc", "--normalisationConstants", action="store", help="Either a CSV file listing the file name for each tree (column 1) and the normalisation constant (column 2) or a single numerical normalisation constant to be applied to each window. If both -nr and -nc are absent then no normalisation will be performed.")
# Blacklisting
arg_parser$add_argument("-db", "--duplicateBlacklist", action="store", help="Perform blacklisting for likely contamination between samples in this data set, suggested by exact duplicate reads found in different samples. Use this option to specify a string that begins the file names (including their path) of the duplication data produced by phyloscanner_make_trees.py. For example, 'path/to/DuplicateReadCountsProcessed_InWindow_'. This option must be used in conjunction with the --rawBlacklistThreshold and/or --ratioBlacklistThreshold option.")
arg_parser$add_argument("-pbk", "--parsimonyBlacklistK", action="store", type="double", help="Perform parsimony-based blacklisting for likely contaminant sequences (including those originating from a sample outside the current data set). Use this option to specify the value of the within-host diversity penalty used (corresponding to the k parameter in the Sankoff option of the splitsRule argument). This option must be used in conjunction with the --rawBlacklistThreshold and/or --ratioBlacklistThreshold option.")
arg_parser$add_argument("-rwt", "--rawBlacklistThreshold", action="store", default=0, help="Used to specify a read count to be used as a raw threshold for blacklisting. --parsimonyBlacklistK and/or --duplicateBlacklist must also be used. If --parsimonyBlacklistK is used, we will blacklist any subgraph with a read count strictly less than this threshold. If --duplicateBlacklist is used, we will black list any duplicate read with a count strictly less than this threshold. The default value of 0 means nothing is blacklisted.")
arg_parser$add_argument("-rtt", "--ratioBlacklistThreshold", action="store", default=0, help="Used to specify a read count ratio (between 0 and 1) to be used as a threshold for blacklisting. --parsimonyBlacklistK and/or --duplicateBlacklist must also be used. If --parsimonyBlacklistK is used, we will blacklist a subgraph if the ratio of its read count to the total read count from the same host is strictly less than this threshold. If --duplicateBlacklist is used, we will black list a duplicate read if the ratio of its count to the count of the duplicate (from another host) is strictly less than this threshold.")
arg_parser$add_argument("-ub", "--dualBlacklist", action="store_true", default=F, help="Blacklist all reads from the minor subgraphs for all hosts established as dual by parsimony blacklisting.")
# Downsampling
arg_parser$add_argument("-dsl", "--maxReadsPerHost", action="store", type="integer", help="If given, blacklist to downsample read counts from each host to this number.")
arg_parser$add_argument("-dsb", "--blacklistUnderrepresented", action="store_true", help="If present and -dsl is given, blacklist hosts from trees where their total tip count does not reach the maximum.")
args <- arg_parser$parse_args()
verbose <- args$verbose
overwrite <- args$overwrite
tree.input <- args$tree
tree.directory <- dirname(tree.input)
tree.file.regex <- paste0("^", basename(tree.input), "(.*)\\.[A-Za-z]+$$")
alignment.input <- args$alignment
alignment.directory <- dirname(alignment.input)
alignment.file.regex <- paste0("^", basename(alignment.input), "(.*)\\.[A-Za-z]+$$")
blacklist.input <- args$blacklist
if(!is.null(blacklist.input)){
user.blacklist.directory <- dirname(blacklist.input)
user.blacklist.file.regex <- paste0("^", basename(blacklist.input), "(.*)\\.[A-Za-z]+$")
} else {
user.blacklist.directory <- NULL
user.blacklist.file.regex <- NULL
}
output.dir <- args$outputDir
if(is.null(output.dir)){
output.dir <- getwd()
}
output.string <- args$outputString
if(!overwrite & length(list.files(path=output.dir, pattern=paste0("^", output.string, ".*")))>0 ){
stop("Previous output with this output string (",output.string,") detected. Please re-run with --overwrite if you wish to overwrite this.")
}
outgroup.name <- args$outgroupName
if(is.null(outgroup.name)){
warning("No outgroup name provided. Trees are assumed to be correctly rooted.")
outgroup.name <- NULL
}
use.m.thresh <- !is.null(args$multifurcationThreshold)
tree.fe <- args$treeFileExtension
csv.fe <- args$csvFileExtension
seed <- args$seed
if(use.m.thresh){
if(args$multifurcationThreshold=="g"){
m.thresh <- NA
} else if(!is.na(as.numeric(args$multifurcationThreshold))){
m.thresh <- as.numeric(args$multifurcationThreshold)
} else {
stop("Unknown argument for --multifurcationThreshold specified\n")
}
} else {
m.thresh <- -1
}
norm.ref.file.name <- args$normRefFileName
norm.standardise.gp <- args$normStandardiseGagPol
norm.constants.input <- args$normalisationConstants
tip.regex <- args$tipRegex
file.name.regex <- args$fileNameRegex
if(!is.null(norm.ref.file.name) & !is.null(norm.constants.input)){
warning("Normalisation reference file name and predetermined normalisation constants both specified. Only the latter will be used.")
}
do.dup.blacklisting <- !is.null(args$duplicateBlacklist)
dup.input.file.name <- args$duplicateBlacklist
if(!is.null(dup.input.file.name)){
duplicate.file.directory <- dirname(dup.input.file.name)
duplicate.file.regex <- paste0("^", basename(dup.input.file.name), "(.*)\\.[A-Za-z]+$")
} else {
duplicate.file.directory <- NULL
duplicate.file.regex <- NULL
}
do.par.blacklisting <- !is.null(args$parsimonyBlacklistK)
par.blacklisting.k <- args$parsimonyBlacklistK
do.dual.blacklisting <- args$dualBlacklist
if(do.dual.blacklisting & !do.par.blacklisting){
warning("Dual blacklisting requires parsimony blacklisting. Turning dual blacklisting off.")
do.dual.blacklisting <- F
}
bl.raw.threshold <- as.numeric(args$rawBlacklistThreshold)
bl.ratio.threshold <- as.numeric(args$ratioBlacklistThreshold)
if(do.par.blacklisting & bl.raw.threshold == 0 & bl.ratio.threshold == 0){
stop("Parsimony blacklisting requested but no thresholds specified with -rwt or -rtt")
}
if(do.dup.blacklisting & bl.raw.threshold == 0 & bl.ratio.threshold == 0){
stop("Duplicate blacklisting requested but no thresholds specified with -rwt or -rtt")
}
downsample <- !is.null(args$maxReadsPerHost)
downsampling.limit <- args$maxReadsPerHost
if(is.null(downsampling.limit)){
downsampling.limit <- Inf
}
blacklist.ur <- args$blacklistUnderrepresented
success <- remove.blacklist.from.alignment(
tree.directory,
alignment.directory,
tree.file.regex,
alignment.file.regex,
outgroup.name,
m.thresh,
is.na(m.thresh),
user.blacklist.directory,
user.blacklist.file.regex,
duplicate.file.directory,
duplicate.file.regex,
tip.regex,
file.name.regex,
seed,
norm.ref.file.name,
norm.standardise.gp,
norm.constants.input,
par.blacklisting.k,
bl.raw.threshold,
bl.ratio.threshold,
do.dual.blacklisting,
downsampling.limit,
blacklist.ur,
output.string,
verbose,
T)