-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathget-SuperEnhancers.R
221 lines (182 loc) · 8.31 KB
/
get-SuperEnhancers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
#####
# You can use this script to delineate super-enhancers from a .bed, *only* after it has been
# pre-processed by bamliquidator. Conveniently this can be done by riesling.py, which also
# allows for alternative stratifications of super-enhancers.
#
#
# Run as: Rscript get-SuperEnhancers.R input.bed output_directory
#
#
# **Input: A .bed from bamliquidator
# (A .bed with 7 columns, where the 7th column contains the normalized score for ranking.)
#
# ## This input might look like:
# V1 V2 V3 V4 V5 V6 V7
# 1 chr1 3514643 3515351 1 30438 . 0.17688850
# 2 chr1 4426753 4427110 2 4119 . 0.04747231
#
#
# **Output:
# Super-enhacners (defined by the point where the tangent reaches 1) and
# Stretch enhancers (>3 kb)
# Super-Stretch enhancers (> tangent cutoff AND >3kb)
#
#
# Copyright (c) 2014-2016 Nick Semenkovich <[email protected]>.
# https://nick.semenkovich.com/
#
# Developed for the Gordon Lab, Washington University in St. Louis (WUSTL)
# https://gordonlab.wustl.edu/
#
# This software is released under the MIT License:
# http://opensource.org/licenses/MIT
#
# Source: https://github.com/GordonLab/riesling-pipeline
#
# Includes approaches inspired by:
# https://stackoverflow.com/questions/29642867/drawing-a-tangent-to-the-plot-and-finding-the-x-intercept-using-r
# Force sane graphics output
options(bitmapType='cairo')
#### Cleanup
# dev.off(dev.list()["RStudioGD"]) # Clear the graphs
rm(list = ls(all = TRUE)) # Clear all ojects from workspace
cat("\014") # Reset the console
# argv[1] is the input bed file we're operating on.
args <- commandArgs(TRUE)
input_filename = args[1]
output_dir = args[2]
print(paste('Working on:', input_filename))
print(paste('Output dir:', output_dir))
bed <- read.table(input_filename, sep="\t", header=FALSE)
## Looks like:
# V1 V2 V3 V4 V5 V6 V7
# 1 chr1 3514643 3515351 1 30438 . 0.17688850
# 2 chr1 4426753 4427110 2 4119 . 0.04747231
if (!is.na(output_dir)) {
print(paste('Current directory is:', getwd()))
print(paste('Setting output directory to:', output_dir))
setwd(output_dir)
}
## Sort and scale axes
y = sort(bed[,c(7)]*(bed[,c(3)]-bed[,c(2)])) # normalized_counts * width
x = c(1:length(y))
ynorm = y*length(x)/max(y)
# plot(x, ynorm)
# plot(x, ynorm, log="xy")
spl <- smooth.spline(x, ynorm)
# pred <- predict(spl)
# lines(pred, col=2)
ynorm.prime <- diff(ynorm)/diff(x)
# plot(ynorm.prime)
pred.prime <- predict(spl, deriv=1)
# lines(pred.prime$y, col=2)
## Find where the tangent line first crosses 1
se_cutoff <- min(which(pred.prime$y > 1))
print(paste('Inflection at entry:', se_cutoff))
print(paste('Corresponding cutoff score:', y[[se_cutoff]]))
# Use the spline models to plot tangent to that point
pred0 <- predict(spl, x=se_cutoff, deriv=0)
pred1 <- predict(spl, x=se_cutoff, deriv=1)
# And compute intercepts for graphing
yint <- pred0$y - (pred1$y*se_cutoff)
xint <- -yint/pred1$y
################ Save subpopulation beds
y_se_cutoff = y[[se_cutoff]]
se_population = bed[bed$V7*(bed$V3-bed$V2) >= y_se_cutoff,]
te_population = bed[bed$V7*(bed$V3-bed$V2) < y_se_cutoff,]
write.table(se_population, file='0-se-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE)
write.table(te_population, file='0-te-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE)
# >3000 stretch
stretch_cutoff = 3000
stretch_population = bed[bed$V3-bed$V2 >= stretch_cutoff,]
stretch_se_population = se_population[se_population$V3-se_population$V2 >= stretch_cutoff,]
write.table(stretch_population, file='0-stretch-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE)
write.table(stretch_se_population, file='0-stretch-se-population.R.bed', quote=FALSE, sep='\t', col.names = FALSE, row.names = FALSE)
## Save a diagnostic plot
png(paste('se-cutoff.R.png', sep=''), width=800, height=800)
plot(x, ynorm, cex=0.5)
abline(h=0, col=8) # baseline zero
lines(spl, col=2) # spline
abline(h=ynorm[[se_cutoff]], col=8)
abline(v=se_cutoff, col=8)
points(pred0, col=2, pch=19) # point to predict tangent
lines(x, yint + pred1$y*x, col=3) # tangent (1st deriv. of spline at se_cutoff)
points(xint, 0, col=3, pch=19) # x intercept
dev.off()
###
# Export pie charts & histograms
###
## First, make some data frames to use ggplot2
se_sizes = se_population$V3-se_population$V2
se_signals = se_population$V7*se_sizes
te_sizes = te_population$V3-te_population$V2
te_signals = te_population$V7*te_sizes
# size_df = data.frame(c(se_sizes, te_sizes), factor(rep(c('se','te'), c(length(se_sizes), length(te_sizes)))))
# colnames(size_df) <- c('size', 'type')
# signal_df = data.frame(c(se_signals, te_signals), factor(rep(c('se','te'), c(length(se_signals), length(te_signals)))))
# colnames(signal_df) <- c('signal', 'type')
# Pie chart of % SE vs % TE [count]
png(paste('se-vs-te-count-pie.R.png', sep=''), width=800, height=800)
pie(c(nrow(se_population), nrow(te_population)),
labels=c(paste('Super Enhancers\n', nrow(se_population)),
paste('Traditional Enhancers\n', nrow(te_population))),
main="Number of Super- vs Traditional Enhancers")
dev.off()
# Stretch vs Non-Stretch
png(paste('stretch-vs-nonstretch-count-pie.R.png', sep=''), width=800, height=800)
pie(c(nrow(stretch_population), nrow(bed)-nrow(stretch_population)),
labels=c(paste('Stretch Enhancers\n', nrow(stretch_population)),
paste('Non-Stretch Enhancers\n', nrow(bed)-nrow(stretch_population))),
main="Number of Stretch vs Non-Stretch Enhancers")
dev.off()
# And SE stretch / traditional stretch, etc.
png(paste('se-te-stretch-vs-nonstretch-count-pie.R.png', sep=''), width=800, height=800)
pie(c(nrow(se_population)-nrow(stretch_se_population), nrow(stretch_se_population),
nrow(te_population)-(nrow(stretch_population)-nrow(stretch_se_population)),
nrow(stretch_population)-nrow(stretch_se_population)),
labels=c(paste('Super Enhancers\n', nrow(se_population)-nrow(stretch_se_population)),
paste('Super-Stretch Enhancers\n', nrow(stretch_se_population)),
paste('Traditional Enhancers\n', nrow(te_population)-(nrow(stretch_population)-nrow(stretch_se_population))),
paste('Traditional-Stretch Enhancers\n', nrow(stretch_population)-nrow(stretch_se_population))),
main="Number of Super- vs Traditional Enhancers w/ Stretch >3kb")
dev.off()
# Pie chart of % SE Signal vs % TE Signal
total_signal = sum(as.numeric(bed$V5))
se_signal = sum(as.numeric(se_population$V5))
# te_signal = sum(as.numeric(te_population$V5))
se_fraction = round(se_signal/total_signal*100, digits = 1)
te_fraction = 100-se_fraction
png(paste('se-vs-te-signal-pie.R.png', sep=''), width=800, height=800)
pie(c(se_fraction, te_fraction),
labels=c(paste('Super Enhancers\n', se_fraction, '%', sep = ''),
paste('Traditional Enhancers\n', te_fraction, '%', sep = '')),
main="Signal in Super- vs Traditional Enhancers")
dev.off()
###### Histograms
# Histogram of SE sizes
png(paste('se-size-histogram.R.png', sep=''), width=800, height=800)
hist(se_signals, breaks=10,
xlab = 'Super-Enhancer Sizes', ylab="Counts", main="Super-Enhancer Size Distribution")
dev.off()
png(paste('te-size-histogram.R.png', sep=''), width=800, height=800)
hist(te_signals, breaks=10,
xlab = 'Super-Enhancer Sizes', ylab="Counts", main="Traditional-Enhancer Size Distribution")
dev.off()
# Inverse hockeystick, honestly
# qplot(signal, data=signal_df[signal_df$type == 'se',], geom="histogram",
# xlab="Normalized ATAC Signal", ylab="Count")
######## Text Diagnostics
# Print some raw statistics to a text file
fh <- file("0-enhancer-stats.txt", "w")
writeLines(paste("Statistics for:", input_filename), con=fh)
writeLines(paste("SE Signal %:", round(se_fraction,2)), con=fh)
writeLines(paste("TE Signal %:", round(te_fraction,2)), con=fh)
writeLines(paste("SE Count:", nrow(se_population)), con=fh)
writeLines(paste("TE Count:", nrow(te_population)), con=fh)
writeLines(paste("SE Count %:", round(nrow(se_population)/nrow(bed)*100, 2)), con=fh)
writeLines(paste("TE Count %:", round(nrow(te_population)/nrow(bed)*100, 2)), con=fh)
writeLines(paste("Mean SE Size:", round(mean(se_sizes), 2)), con=fh)
writeLines(paste("Mean TE Size:", round(mean(te_sizes), 2)), con=fh)
writeLines(paste("Median SE Size:", median(se_sizes)), con=fh)
writeLines(paste("Median TE Size:", median(te_sizes)), con=fh)
close(fh)