-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutility_functions.R
85 lines (77 loc) · 2.89 KB
/
utility_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# function to load genome info table from ucsc
getGenomeTable <- function(genome = "mm8"){
require(plyr)
gt_url <- sub(pattern = "GENOME",
replacement = genome,
x = "http://hgdownload.cse.ucsc.edu/goldenPath/GENOME/database/chromInfo.txt.gz",
ignore.case = F)
gt <- readLines(gzcon(url(gt_url, "rb")))
gt_df <- ldply(gt, function(x){
tmp <- unlist(strsplit(x , "\t"))
as.data.frame(cbind(chromosome = tmp[1],
size = as.numeric(tmp[2])),
stringsAsFactors = F)
})
gt_df
}
# granges shuffle function, similar to bedtools shuffle
grShuffle <- function(gr_in = NULL, genome_info = NULL, n_sets = 1, method = "chromosome"){
require(plyr)
gi <- genome_info
# remove chromosomes which are not in gr
gi <- gi[gi$chromosome %in% levels(seqnames(gr_in)),]
gi[,"size"] <- as.numeric(gi[,"size"])
# chromosome ends in genomic length cooridinates
chromosome_ends <- laply(1:nrow(gi), function(idx){
sum(as.numeric(gi[1:idx,"size"]))})
# adding chromosome ends in genome coordinates to genome info
gi <- cbind(gi, chromosome_ends)
gr_out <- GRanges()
if(method == "chromosome"){
grl <- split(gr_in, seqnames(gr_in))
gr_out <- llply(1:nrow(gi), function(idx){
n <- length(grl[[gi[idx,"chromosome"]]])
chromosome <- gi[idx,"chromosome"]
new_starts <- runif(n, 1, as.integer(gi[idx,"size"]))
in_width <- width(grl[[gi[idx,"chromosome"]]])
c(gr_out, GRanges(Rle(rep(chromosome, n)),
IRanges(start = new_starts,
end = new_starts + in_width)))
})
gr_out <- unlist(GRangesList(gr_out))
gr_out
}else if(method == "genome"){
genome_length <- as.numeric(sum(gi[,"size"]))
# width of input genomic regions
in_width <- width(gr_in)
# new start positions in genomic coordinates
new_starts <- runif(length(gr_in), 1, genome_length)
# mapping genomic coordinates to chromosomal coordinates
# and return GRanges object
for(idx in 1:nrow(gi)){
chromosome <- gi[idx,"chromosome"]
if(idx == 1){
starts <- which(new_starts > 0 & new_starts <= gi[idx,"chromosome_ends"])
}else{
starts <- which(new_starts > gi[idx-1,"chromosome_ends"] & new_starts <= gi[idx,"chromosome_ends"])
}
start_pos <- gi[idx,3] - new_starts[starts]
end_pos <- gi[idx,3] - new_starts[starts] + in_width[starts]
tmp_gr <- GRanges(Rle(rep(gi[idx,"chromosome"], length(starts))),
IRanges(start_pos, end_pos))
gr_out <- c(gr_out, tmp_gr)
}
gr_out
}
}
# non parametric p vslues
nonParP <- function(observed, expected){
p.out <- sum(abs(expected) > observed)
p.at <- sum(abs(expected) == observed)
(p.out + p.at /2) / (length(expected) +1)
}
# outlier
outlier <- function(x, ...){
idx <- which(x %in% boxplot.stats(x, ...)$out)
idx
}