-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathrunfeatureCountFUN.R
executable file
·406 lines (339 loc) · 18.1 KB
/
runfeatureCountFUN.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
suppressMessages(require(dplyr))
suppressWarnings(suppressMessages(require(GenomicRanges)))
suppressWarnings(suppressMessages(require(GenomicFeatures)))
suppressWarnings(suppressMessages(require(GenomicAlignments)))
suppressWarnings(suppressMessages(require(AnnotationDbi)))
# checkRsubreadVersion<- function(){
# if(length(grep("Rsubread",installed.packages()))==0){
# print("I did not find Rsubread so I am installing it...")
# BiocInstaller::biocLite("Rsubread",dependencies = TRUE, ask = FALSE)
# }else{
# if(all(as.numeric_version(packageVersion("Rsubread"))<'1.26.1')){
# print("I need newer Rsubread so I am updating it...")
# BiocInstaller::biocUpdatePackages("Rsubread", ask=FALSE)
# }
# }
# suppressMessages(require("Rsubread"))
# }
# reads information from Bam file on lengths and names of chromosomes/scaffolds. Can exclude scaffolds based on their size if desired.
.chromLengthFilter <- function(abamfile, keep_chr_minLength =0, samtoolsexc){
bread<- paste(samtoolsexc,"view -H ", abamfile , "| grep '^@SQ' | cut -f2,3 ")
chr<-data.table::fread(bread,col.names = c("chr","len"), header = F)[
, chr2 :=gsub("SN:","",chr)][
, len2 :=gsub("LN:","",len)][
, len2 := as.numeric(len2)][len2>=keep_chr_minLength]
out <- data.frame("name" = chr$chr2, "length" = chr$len2, stringsAsFactors = FALSE)
return(out)
}
# wrapper for creating genomic ranges
get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,
keep.extra.columns=TRUE,
ignore.strand=FALSE,
seqinfo=NULL,
seqnames.field=c("seqnames", "seqname",
"chromosome", "chrom",
"chr", "chromosome_name",
"seqid"),
start.field="start",
end.field=c("end", "stop"),
strand.field="strand",
starts.in.df.are.0based=FALSE)
}
# Chris Alford's overall function for extending exons. Relies on .extension_funcion, .chromLengthFilter, get_gr,
.exon_extension <- function(gtf_input, exon_extension, buffer_length, chrom_kept){
suppressWarnings(suppressMessages(require(GenomicRanges)))
suppressWarnings(suppressMessages(require(GenomicFeatures)))
suppressWarnings(suppressMessages(require(GenomicAlignments)))
suppressWarnings(suppressMessages(require(AnnotationDbi)))
suppressWarnings(suppressMessages(require(plyranges)))
ex_ex <- exon_extension # amount to extent coding region by in bp
bl <- buffer_length # the amount of buffer space in bp between genes
print(paste("starting exon-extension at", Sys.time() ))
print(paste("extending by", exon_extension, "bp"))
# reading out chromosomes/scaffold names from the GRange gtf object
chrom_gtf_names <- tibble::as_tibble(x = list("chrom_names" = as.character(seqinfo(gtf_input)@seqnames)),
.name_repair = "universal")
# doing left join of length data from bam and names from gtf GRange to
# preserve order of chromosomes/scaffolds from the gtf
# adding length info back into GRange object from gtf file.
seqlengths(gtf_input) <- chrom_kept %>%
dplyr::left_join(chrom_gtf_names, ., by = c( "chrom_names" = "name")) %>%
dplyr::pull(length) # vectorizing length, as seqlengths() requires this
# first filtering for exons, making a concensus of transcripts gene wise with reduce, and adding index to keep track of position
filt_gtf <- gtf_input %>%
plyranges::filter(type == "exon") %>%
plyranges::group_by(gene_id) %>%
plyranges::reduce_ranges_directed() # adding index and start for manipulations
# Checking whether we will filter out scaffolds based on size here.
# if the seqnames of our filt_gtf are present in the chrom_kept object from our BAM file, then we keep.
filt_sub_gtf <- filt_gtf[as.character((seqnames(filt_gtf))) %in% chrom_kept$name]
# was having issue with out of bound indices, so removing scaffolds before adding index.
filt_sub_gtf <- filt_sub_gtf %>%
plyranges::mutate(index = 1:length(.),
start2 = start(.))
# From here we subset out max / min exons gene wise according to strand (+)/(-).
max_pos <- filt_sub_gtf %>%
plyranges::filter(strand(.) == "+") %>%
plyranges::group_by(gene_id) %>%
plyranges::filter(start2 == max(start2))
max_pos$start2 <- NULL # apparently plyranges no longer likes the option plyranges::select(-blah) , so setting metacolumn to null is more robust?
min_neg <- filt_sub_gtf %>%
plyranges::filter(strand(.) == "-") %>%
plyranges::group_by(gene_id) %>%
plyranges::filter(start2 == min(start2))
min_neg$start2 <-NULL # apparently plyranges no longer likes the option plyranges::select(-blah) , so setting metacolumn to null is more robust?
# removing start2 column from whole object
mcols(filt_sub_gtf)$start2 <- NULL
# doing preceed over whole object. This returns vector of the following exon for each index in ref to starting object
pre_all <- GenomicRanges::precede(filt_sub_gtf,
filt_sub_gtf,
ignore.strand = FALSE) # made a change here.
# Some entries will be NA in our pre_all vector.
# Most likely represent exons on scaffolds or last exon in chromosome in either direction
names(pre_all) <- 1:length(pre_all) # naming with position in vector
na_list <- as.numeric(names(pre_all[is.na(pre_all)])) # getting locations of NAs out. These correspond to ranges that have no following exon.
# dealing with positive NA's in preceed table
pos_na_ind <- filt_sub_gtf[na_list] %>%
plyranges::filter(strand(.) == "+")
pos_na_ind <- pos_na_ind$index
neg_na_ind <- filt_sub_gtf[na_list] %>%
plyranges::filter(strand(.) == "-")
neg_na_ind <- neg_na_ind$index
# actual extension for NAs. I don't have to go piecemeal, because I'm just assigning the same amount each time.
# also check for chromosome keeping is done above
end(filt_sub_gtf[pos_na_ind]) <- end(filt_sub_gtf[pos_na_ind]) + ex_ex
start(filt_sub_gtf[neg_na_ind]) <- start(filt_sub_gtf[neg_na_ind]) - ex_ex
# trimming extension to only go to end of scaffold / chromosome. Also prints which ones are out of bound.
# internal record of which ones are truncated
out_of_bound_ind <- GenomicRanges:::get_out_of_bound_index(filt_sub_gtf)
filt_sub_gtf <- trim(filt_sub_gtf)
# index of the 3' exons in + and - direction
index_neg <- mcols(min_neg)$index
index_pos <- mcols(max_pos)$index
# Create my list from dataframes
df <- data.frame(filt_sub_gtf)
# splits data frame into a list based on the gene ID
df_list<- df %>%
dplyr::group_split(gene_id)
.extension_function <- function(x, pre_all, df){
if (x %>%
dplyr::select(strand) %>% # check to see if the strand in positive
unique == "+"){
strand_var <- "pos" # Assign strand variable to "pos"
ind <- x %>%
dplyr::filter(end == max(end)) %>% # finding the exon that has the max "end" value and
dplyr::pull(index) # pulling the index for this entry
} else {strand_var <- "neg"
ind <- x %>% # likewise, doing the opposite for negative stranded genes
dplyr::filter(start == min(start)) %>%
dplyr::pull(index) # pulling index for min most exon for negative strand
}
if(is.na(pre_all[ind])){ # pulling following exon of min/max exons in each list entry
#print("NA value, skipping instance") # check for NA values in pre_all vector
return(x)
} else {
if(strand_var == "pos"){ # Check the strand var
small <- df[ind,]$end # If pos, assign preceeding exon as "small" and following exon as "big"
big <- df[pre_all[ind],]$start
} else {
big <- df[ind,]$start # if negative assign preceeding exon as big and following exon as small
small <- df[pre_all[ind],]$end
}
# actual extension part
if(big - small > ex_ex + bl){ # if distance between exons is greater than buffer and extension
ext <- ex_ex # then the full extension is used
} else {
if(big - small <= bl){ # Check if the exons are are closer than buffer already ( ex, if exons are 15 bp away or something)
ext <- 0 # if so, setting extension to 0
} else {
ext <- big - small - bl # If all else is false, then set extension to difference between start and end points, minus buffer
}
}
if(strand_var == "pos"){ #
#print(paste("positive gene index",
# ind,"extension of",ext))
x[x$index == ind,]$end <- x[x$index == ind,]$end + ext
return(x)
} else{
#print(paste("negative gene index",
# ind,"extension of",ext))
x[x$index == ind,]$start <- x[x$index == ind,]$start - ext
return(x)
}
}
return(x)
}
# extension function for the groups
# each entry in the list will be all exons for the gene
# index for each range represents position in original Grange.
df_out <- parallel::mclapply(df_list, .extension_function, pre_all = pre_all, df = df, mc.cores = opt$num_threads)
# unbinding list
df_output <- dplyr::bind_rows(df_out)
# getting output into a Genomic Range object
df_out_gr <- get_gr(df_output)
#final return
print("end extension")
return(df_out_gr)
}
#new version of makeSAF by Chris Alford
.makeSAF<-function(gtf, extension_var=FALSE, exon_extension=0, buffer_length=100, scaff_length=0,multi_overlap_var=FALSE, samtoolsexc){
print("Loading reference annotation from:")
print(gtf)
gtf_file <- plyranges::read_gff(gtf, col_names = c("type","gene_id"))
# Check for gene information in gtf file --------------
# if "gene" is in type column from GTF, uses those desingations. If not, creates
# such a field from the max-end and min-start values after grouping by gene_id.
if("gene" %in% levels(gtf_file$type)==TRUE){
# this represents total length of each gene (unextended)
gr.gene <- gtf_file %>%
dplyr::filter(type =="gene")
}else{
gr.gtf.filt <- gtf_file %>%
dplyr::filter(type =="exon")
gr.gene <- gr.gtf.filt %>%
group_by(gene_id, strand, seqnames) %>%
summarise(end = max(end),
start = min(start),
type = "gene" ) %>%
plyranges::as_granges()
}
#Calculating total intergenic length for intron-probability
totalGeneLength<-sum(width(gr.gene))
#get length of chromosomes from BAM header
chrL<-system( paste(samtoolsexc,"view -H", abamfile ," | grep '^@SQ' | cut -f3 "), intern = T)
chrL<-lapply(chrL, function(x){
x<-as.integer(strsplit(x, ":")[[1]][[2]])
})
chrL<-do.call(sum,chrL)
intergenicBp<- chrL-totalGeneLength
# getting names and length of chromosomes/scaffolds from bam header
chrom_kept <- .chromLengthFilter(abamfile = abamfile, keep_chr_minLength = scaff_length, samtoolsexc = samtoolsexc)
# reading out chromosomes/scaffold names from the GRange gtf object
chrom_gtf_names <- tibble::as_tibble(x = list("chrom_names" = as.character(seqinfo(gtf_file)@seqnames)),
.name_repair = "universal")
## Exon creation ##
# check to see if any extension is going to occur
# if so, then do extension for everything. If not still do basic filtering for exons, reducing for
# only scaffolds chosen to be kept.
# adding check for NULL, which would come from YAML with no extension option
if(is.null(extension_var)) {
extension_var <- FALSE
}
if(extension_var == TRUE) {
exon <- .exon_extension(gtf_input = gtf_file,
exon_extension = exon_extension,
buffer_length = buffer_length,
chrom_kept = chrom_kept)
} else {
filt_gtf <- gtf_file %>%
plyranges::filter(type == "exon") %>%
plyranges::group_by(gene_id) %>%
plyranges::reduce_ranges_directed()
exon <- filt_gtf[as.character((seqnames(filt_gtf))) %in% chrom_kept$name]
}
# this reduces all exons across all genes, and performs gaps to make introns
gap_obj <- exon %>%
plyranges::reduce_ranges() %>%
GenomicRanges::gaps()
# adding code to do disjoin for transcript objects, and then overlapping to get GeneIDs back into introns -CA 03.01.20
gr.gene.dis <- gr.gene %>%
plyranges::disjoin_ranges() %>%
plyranges::find_overlaps_within(.,gr.gene) %>%
plyranges::mutate(index = 1:length(.))
# additional code to ensure introns are only assigned to one gene annotation
rm_ind <- plyranges::find_overlaps(gr.gene.dis, # find overlapping ranges in same object from different genes, assign to "rm_ind"
gr.gene.dis,
minoverlap = 1) %>%
plyranges::filter(gene_id.x != gene_id.y) %>% # because this is a self-comparison, filtering out genes that match each other
data.frame() %>%
dplyr::pull(index.x) %>% # pull indices, as these represent ranges that are overlapping and belong to
unique() # different genes.
gr.gene.dis.filt <- gr.gene.dis %>%
plyranges::filter(!(index %in% rm_ind)) %>% # removing based on "rm_ind"
plyranges::select(-index) # getting rid of index after removal, as we don't need anymore
# this finds introns that are only inside of the disjoined gene's transcript object
intron <- plyranges::find_overlaps_within(gap_obj, gr.gene.dis.filt)
# filtering for introns to be greater than 10 bp and less than 1e5
intron.filt <- intron %>%
plyranges::filter(width > 10 & width < 1e5)
# Creation of intron and Exon Saf
intron.saf <- data.frame(GeneID = intron.filt$gene_id,
Chr = seqnames(intron.filt),
Start = start(intron.filt),
End = end(intron.filt),
Len = width(intron.filt), # needed for intron-probability
stringsAsFactors = FALSE)
exon.saf <- data.frame(GeneID = exon$gene_id,
Chr = seqnames(exon),
Start = start(exon),
End = end(exon),
Strand = strand(exon),
stringsAsFactors = F)
# Getting strand info back into intron saf
intron.saf <- dplyr::left_join(intron.saf, unique(exon.saf[,c("GeneID","Strand")]), by = c("GeneID"))
#get intronic bp per gene and intronic reads per gene
intronGenes <- intron.saf %>% group_by(GeneID) %>% summarize(IntronLengthPerGene = sum(Len), intronsPerGene = length(GeneID), .groups = "drop")
saf <- list(introns = unique(intron.saf), exons = unique(exon.saf), intronsPerGene = intronGenes,intergenicBp = intergenicBp)
print("Annotation loaded!")
rm(gr.gene,
intron,
exon,
intron.saf,
exon.saf,
intronGenes,
intergenicBp)
return(saf)
}
.runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var,fraction_overlap){
print(paste0("Assigning reads to features (",type,")"))
# fc.stat<-Rsubread::featureCounts(files=abamfile,
fc.stat <- featureCounts(files=abamfile,
annot.ext=saf,
isGTFAnnotationFile=F,
primaryOnly=primaryOnly,
allowMultiOverlap=multi_overlap_var,
countMultiMappingReads=primaryOnly,
nthreads=cpu,
reportReads="BAM",
strandSpecific=strand,
isPairedEnd=T,
countChimericFragments=F,
fcounts_clib = fcounts_clib,
largestOverlap = TRUE,
fracOverlap = fraction_overlap,
isIntronInput = ifelse(type == "in", 1, 0))$stat
fn<-paste0(abamfile,".featureCounts.bam")
nfn<-paste0(abamfile,".",type,".featureCounts.bam")
system(paste0("mv ",fn," ",nfn,".tmp"))
invisible(suppressWarnings(suppressMessages(gc(verbose=F))))
return(nfn)
}
.get_tx_lengths <- function(gtf){
txdb <- suppressWarnings(suppressMessages(GenomicFeatures::makeTxDbFromGFF(file=gtf, format="gtf")))
exon <- GenomicFeatures::exonsBy(txdb, by="gene")
exon <- reduce(exon)
len <- lapply(exon, width)
len <- sapply(len, sum, USE.NAMES=T)
len_dt <- data.table(
GeneID = names(len),
tx_bp = as.numeric(len)
)
len_dt <- len_dt[,.(tx_bp = median(tx_bp)),by = GeneID]
return(len_dt)
}
.get_gene_names <- function(gtf, threads){
gtf.dt <- fread(gtf, sep="\t",header=F)
ge <- gtf.dt[V3 == "gene"]
gtf_info <- ge$V9
info_parsed <- parallel::mclapply(gtf_info, function(x){
dat <- data.table(V1=unlist(strsplit(x,"; ")))
dat[,c("name","value") := tstrsplit(V1, " ")][
,V1 := NULL][
,value := gsub(pattern = "\"", replacement = "", x = value)]
dat <- dat[name %in% c("gene_id","gene_name")]
dat <- dcast(dat, .~name, value.var = "value")
dat[,"." := NULL]
}, mc.cores = threads)
info_parsed <- rbindlist(info_parsed)
return( info_parsed[! (is.na(gene_name) | is.na(gene_id))] )
}