-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCytoband_on_Survial.R
315 lines (303 loc) · 15 KB
/
Cytoband_on_Survial.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
#Cytoband Matrix
#########################
##Step 0: Libraries
#########################
require(survival); require(survminer)
'%!in%' <- function(x,y)!('%in%'(x,y))
#########################
##Step 0: Load Files
#########################
setwd('/Users/sinhas8/Project_Chromotrypsis/')
# mat=read.csv('2.Data/Corrected_NCIMD_HRD_by_LOH_and_GI.csv')
# mat_updated=read.csv('demo_updated.csv')
cytobands=read.csv('2.Data/cytobandv1', sep='\t')
#########################
##Step 0: Pre-Processing
#########################
##functions needed
getwd()
setwd('/Users/sinhas8/Project_Chromotrypsis/')
source('./3.Tools/race_specific_corr_wd_exp.R')
#########################
##Step 0: Loading files:: Exp and CNV
#########################
##Expression
Exp=cbind(read.csv('2.Data/Exp_AA_AD.csv'),
read.csv('2.Data/Exp_EA_AD.csv'),
read.csv('2.Data/Exp_AA_SC.csv'),
read.csv('2.Data/Exp_EA_SC.csv'))
rownames(Exp)=make.names(Exp[,1], unique = T)
Exp=Exp[,-1]
Exp=Exp[,-grep('X',colnames(Exp))]
colnames(Exp)=sapply(colnames(Exp), function(x) gsub("[^0-9]", "", x))
##CNV dataset for gene
setwd('/Users/sinhas8/Project_Chromotrypsis/')
Cancer_Type='LUSC'
CNV_AA=read.csv(paste('./4.Results/GISTIC_results/Trimmed_GISTIC_Results/',Cancer_Type, '_AA','/', 'all_data_by_genes.txt', sep=''), sep='\t', row.names=1)[,-(1:2)]
CNV_AA=CNV_AA[-which(duplicated(sapply(rownames(CNV_AA), function(x) strsplit(x, '\\|')[[1]][1]))),]
rownames(CNV_AA)=sapply(rownames(CNV_AA), function(x) strsplit(x, '\\|')[[1]][1])
CNV_EA=read.csv(paste('./4.Results/GISTIC_results/Trimmed_GISTIC_Results/',Cancer_Type, '_EA','/', 'all_data_by_genes.txt', sep=''), sep='\t', row.names=1)[,-(1:2)]
CNV_EA=CNV_EA[-which(duplicated(sapply(rownames(CNV_EA), function(x) strsplit(x, '\\|')[[1]][1]))),]
rownames(CNV_EA)=sapply(rownames(CNV_EA), function(x) strsplit(x, '\\|')[[1]][1])
CNV1=cbind(CNV_AA, CNV_EA)
Cancer_Type='LUAD'
CNV_AA=read.csv(paste('./4.Results/GISTIC_results/Trimmed_GISTIC_Results/',Cancer_Type, '_AA','/', 'all_data_by_genes.txt', sep=''), sep='\t', row.names=1)[,-(1:2)]
CNV_AA=CNV_AA[-which(duplicated(sapply(rownames(CNV_AA), function(x) strsplit(x, '\\|')[[1]][1]))),]
rownames(CNV_AA)=sapply(rownames(CNV_AA), function(x) strsplit(x, '\\|')[[1]][1])
CNV_EA=read.csv(paste('./4.Results/GISTIC_results/Trimmed_GISTIC_Results/',Cancer_Type, '_EA','/', 'all_data_by_genes.txt', sep=''), sep='\t', row.names=1)[,-(1:2)]
CNV_EA=CNV_EA[-which(duplicated(sapply(rownames(CNV_EA), function(x) strsplit(x, '\\|')[[1]][1]))),]
rownames(CNV_EA)=sapply(rownames(CNV_EA), function(x) strsplit(x, '\\|')[[1]][1])
CNV2=cbind(CNV_AA, CNV_EA)
CNV=cbind(CNV1, CNV2)
colnames(CNV)[grep('_',colnames(CNV))]=
sapply(gsub('X','',colnames(CNV)[grep('_',colnames(CNV))]), function(x) strsplit(x,'_')[[1]][2])
colnames(CNV)=gsub('X','',colnames(CNV))
colnames(CNV)[1]=substring(colnames(CNV)[1], 1, nchar(colnames(CNV)[1])-10)
colnames(CNV)=sapply(sapply(colnames(CNV), function(x) strsplit(x, '\\.')[[1]][1]),
function(x) gsub("[^0-9]", "", x))
#########################
##Step 0: pre-processing
#########################
# mat_updated=mat_updated[na.omit(match(mat$acc, mat_updated$acc)),]
# mat=mat[!is.na(match(mat$acc, mat_updated$acc)),]
# mat_updated=mat_updated[,c(1, 4, 6)]
# mat_updated$hist=mat$hist; mat_updated$race=mat$race
#gsub('chr','',cytobands$X.chrom)
cytobands$Chr=as.numeric(gsub('chr','',sapply(cytobands$X.chrom, function(x)
strsplit(as.character(x), '_')[[1]][1])))
cytobands=na.omit(cytobands)
cytobands$loc=sapply(seq(nrow(cytobands)), function(x)
paste(cytobands$Chr[x], cytobands$name[x], sep=''))
#########################
##Step 0: Define Functions
#########################
return_peaks<-function(hist='LUSC', race='AA'){
qq=read.csv(paste('4.Results/GISTIC_results/Trimmed_GISTIC_Results/',
hist, '_',
race, '/',
'all_lesions.conf_90.txt',
sep=''), sep='\t')
qq_FF=qq[qq$Amplitude.Threshold!=levels(qq$Amplitude.Threshold)[3],]
qq_FF=qq_FF[,-ncol(qq_FF)]
proc_CNV=qq_FF
rownames(proc_CNV)=paste0(proc_CNV$Unique.Name, '_',
proc_CNV$Descriptor)
proc_CNV=proc_CNV[,-c(1:9)]
proc_CNV=t(proc_CNV[,!is.na(match(substring(colnames(proc_CNV), 2, ), gsub('-','.',mat$File)) ) ])
surv = mat[na.omit(match(substring(rownames(proc_CNV), 2,),
gsub('-','.',mat$File) )) ,
c('lungcancer_death_all_years','survival', 'stage_trimmed')]
proc_CNV=proc_CNV[,apply(proc_CNV, 2, function(x) length(table(x)) )>1]
df=cbind(surv, proc_CNV)
By_Regions=t(apply(df[,-(1:3)], 2,
function(x) summary(coxph(Surv(df$survival,
df$lungcancer_death_all_years) ~
as.factor(x) +
df$stage_trimmed))$coefficients[1,c(2,5)]))
colnames(By_Regions)=c('Hazard_Ratio', 'P.Value')
# write.csv(By_Regions, paste('/home/sinhas8/Projects/Project_Chromotrypsis/2.Data/', hist, '_', race,
# 'Survival_by_Cytobands.csv', sep=''))
Region=gsub(' ','',sapply(rownames(By_Regions),
function(x) strsplit(x,'_')[[1]][2]))
df_peaks=data.frame(By_Regions,
Frequency=apply(proc_CNV, 2, function(x) sum(x>0)/length(x)),
Type=sapply(rownames(By_Regions),
function(x) ifelse(grepl('Del',x), 'Del', 'Amp')),
Region)
#calculate their Frequency
list(df, df_peaks)
}
Peaks_Freq<-function(hist='LUSC'){
##Calculate their Frequency
Freq=read.csv(paste('4.Results/GISTIC_results/Trimmed_GISTIC_Results/Freq_',
hist,'.csv', sep=''))
Freq$cyto_AA=gsub(' ','',sapply(Freq$cyto_AA, function(x) strsplit(as.character(x),'\\|')[[1]][1]))
Freq$cyto_EA=gsub(' ','',sapply(Freq$cyto_EA, function(x) strsplit(as.character(x),'\\|')[[1]][1]))
CNV_hist=CNV[,na.omit(match(mat$acc[mat$hist==hist], colnames(CNV)))]
Freq_AAPeaks=t(sapply(split(Freq[,c(6,7,8,9,10,11)], Freq$cyto_AA),
function(x) apply(x[,-(1:2)], 2, median) ))
Freq_EAPeaks=t(sapply(split(Freq[,c(6,7,8,9,10,11)], Freq$cyto_EA),
function(x) apply(x[,-(1:2)], 2, median) ))
#For AA
ForAA=return_peaks(hist, race='AA')[[2]]
Freq_AAPeaks=Freq_AAPeaks[na.omit(match(as.character(ForAA$Region), rownames(Freq_AAPeaks))),]
ForAA=ForAA[!is.na(match(as.character(ForAA$Region), rownames(Freq_AAPeaks))),]
ForAA=cbind(ForAA, Freq_AAPeaks)
ForEA=return_peaks(hist, race='EA')[[2]]
Freq_EAPeaks=Freq_EAPeaks[na.omit(match(as.character(ForEA$Region), rownames(Freq_EAPeaks))),]
ForEA=ForEA[!is.na(match(as.character(ForEA$Region), rownames(Freq_EAPeaks))),]
ForEA=cbind(ForEA, Freq_EAPeaks)
df_AAPeaks=sapply(split(Freq$X, Freq$cyto_AA),
function(x) apply(CNV_hist[as.character(x),], 2, median) )
df_AA=cbind(mat[!is.na(match(mat$acc, colnames(CNV_hist))),],df_AAPeaks)
df_EAPeaks=sapply(split(Freq$X, Freq$cyto_EA),
function(x) apply(CNV_hist[as.character(x),], 2, median) )
df_EA=cbind(mat[!is.na(match(mat$acc, colnames(CNV_hist))),], df_EAPeaks)
list(ForAA, ForEA, df_AA, df_EA)
}
Filter2_AA_specific_recurrent<-function(ForAA, ForEA){
AA_Peaks=rownames(ForAA)
EA_Peaks=rownames(ForEA)
AA_Peaks=gsub(' ','',sapply(AA_Peaks, function(x) strsplit(x, '_')[[1]][2]))
EA_Peaks=gsub(' ','',sapply(EA_Peaks, function(x) strsplit(x, '_')[[1]][2]))
AA_Peaks_wdLocation=cbind(AA_Peaks, cytobands[match(AA_Peaks, cytobands$loc),])
EA_Peaks_wdLocation=cbind(EA_Peaks, cytobands[match(EA_Peaks, cytobands$loc),])
K=1
ForAA[rownames(AA_Peaks_wdLocation[which(!as.logical(sapply(1:nrow(AA_Peaks_wdLocation), function(K)
tenpercentRegion_test(EA_Peaks_wdLocation, AA_Peaks_wdLocation[K,])))),]),]
}
tenpercentRegion_test<-function(listA=EA_Peaks_wdLocation, region=AA_Peaks_wdLocation[K,], extra=0.1){
listA$chromStart= as.numeric(listA$chromStart)
listA$chromEnd= as.numeric(listA$chromEnd)
test_location= data.frame(Chr=region$Chr,
start=max(0, region$chromStart-(extra*region$chromStart)),
end=region$chromEnd+(extra*region$chromEnd),
Type=rownames(region))
listA=listA[listA$Chr==test_location$Chr &
(grepl('Del',rownames(listA))==grepl('Del',test_location$Type)) ,]
sum(listA[,3]>test_location$start & listA[,3]<test_location$end |
listA[,4]>test_location$start & listA[,4]<test_location$end)
}
All_Steps<-function(hist='LUAD', K=1, whether_leg='none', return_plot=FALSE, numPlots=2){
ForAA=return_peaks(hist, race='AA')[[2]]
ForEA=return_peaks(hist, race='EA')[[2]]
#Filter 1: at least 5% of the population
ForAA=ForAA[ForAA$Frequency>0.1,]
#Filter 2: AA-specific recurrent
ForAA_specific=Filter2_AA_specific_recurrent(ForAA, ForEA)
#Preprocessing
ForAA_wdFreq=Peaks_Freq(hist)[[1]][rownames(ForAA_specific),]
ForAA_Amp=ForAA_wdFreq[grep('Amp',rownames(ForAA_wdFreq)),c(1:5, 6,7)]
colnames(ForAA_Amp)[6:7]=c('Freq_in_EA', 'Freq_in_AA')
ForAA_Del=ForAA_wdFreq[grep('Del',rownames(ForAA_wdFreq)),c(1:5, 8,9)]
colnames(ForAA_Del)[6:7]=c('Freq_in_EA', 'Freq_in_AA')
ForAA=rbind(ForAA_Amp, ForAA_Del)
#Filter 3: K times more in AA
ForAA=ForAA[ForAA$Freq_in_AA>(ForAA$Freq_in_EA*2),]
ForAA=ForAA[order(ForAA$Freq_in_AA-ForAA$Freq_in_EA, decreasing = T),]
ForAA=ForAA[order(ForAA$Freq_in_AA/ForAA$Freq_in_EA, decreasing = T),]
# ##Assocaited with Srrvival
# df=cbind(Peaks_Freq(hist)[[3]][,1:5],
# Peaks_Freq(hist)[[3]][,as.character(ForAA$Region)])
# binarized_df=cbind(df[,1:5],
# apply(df[,as.character(ForAA$Region)[ForAA$Type=='Amp']], 2,
# function(x) as.numeric(x> 0.1)),
# apply(df[,as.character(ForAA$Region)[ForAA$Type=='Del']], 2,
# function(x) as.numeric(x< -0.1))
# )
# binarized_df=binarized_df[,!apply(binarized_df, 2, function(x) sum(is.na(x))>1 )]
# surv_info=data.frame(t(sapply(colnames(binarized_df)[-c(1:5)], function(x)
# summary(coxph(Surv(survival_2017, lungcancer_death_all_years_2017) ~
# factor(binarized_df[,x]), data=binarized_df))$coefficients[1,c(2,5)])))
# colnames(surv_info)=c('HR', 'P')
# regions_OI=gsub('X', '',rownames(surv_info)[which(surv_info$P<0.1 & surv_info$HR>1)])
# if(length(which(surv_info$P<0.1 & surv_info$HR>1))>0 ){
# fit1=survfit(Surv(survival_2017, lungcancer_death_all_years_2017) ~
# binarized_df[,colnames(binarized_df)==regions_OI[K]],
# data=binarized_df)
# ggsurv1<-
# ggsurvplot(fit1,
# data = binarized_df,
# pval = FALSE,
# xlab = "Time in days",
# ggtheme = theme_bw(base_size = 24),
# legend.title='CNV',
# legend.labs=c('No Change',
# #as.character(ForAA$Type[ForAA$Region==paste(regions_OI[K])])
# 'Del'
# ),
# palette = c("black", "green"),
# legend=whether_leg
# )
# ggsurv1$plot <- ggsurv1$plot +
# theme(
# # axis.title.x = element_blank(),
# # axis.title.y = element_blank(),
# legend.key.size = unit(2.5,'line'),
# legend.text = element_text(size=15))+
# annotate("text",
# x = 500, y = 0.23, # x and y coordinates of the text
# label = paste("p < ", round(ForAA$P.Value[which(ForAA$P.Value<0.1)][K],3)),
# size = 5)
# }
# if(return_plot){return(ggsurv1)
# } else {
return(ForAA)
# }
}
#########################
##Step 0: Call Functions
#########################
Panel2=All_Steps(hist='LUSC',
K=1,
'top',
return_plot=TRUE)
leg=get_legend(All_Steps(hist='LUSC', 1, 'top', return_plot=TRUE))
######################################
####Plot Freq plot
######################################
hist='LUSC'
df_OI=data.frame(All_Steps(hist, 1, 'top', return_plot=FALSE, numPlots = 10), hist)
df2plot=data.frame(Peak=rep(paste(df_OI$Type,
df_OI$Region), each=2),
Freq=c(apply(df_OI, 1, function(x) as.numeric(x[c(6,7)]) )),
race=rep(c('EA', 'AA'), nrow(df_OI)),
Hist=rep(df_OI$hist, each=2))
colorType_Set='Set1'
Panel1=ggplotGrob(
ggplot(df2plot, aes(y=Freq, x=race, fill=race))+
geom_bar(stat = 'identity')+
labs(x="Race", y = "Frequency")+
theme_bw(base_size = 20)+
guides(fill=FALSE)+
scale_fill_brewer(palette=colorType_Set)+
facet_grid(Peak~Hist, scales = 'free', switch = 'y')+
#geom_text(data=Pvalues, aes(x=xstar, y=ystar, label=p_values), size=8)+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank(),
strip.text.y = element_text(size = 30),
strip.text.x = element_text(size = 30)
#strip.background.y = element_blank(),
#strip.text.y = element_blank())
))
tiff(paste('/Users/sinhas8/Project_Chromotrypsis/prep_final_figures/Figure3Bv2_',hist,'Comp_Supp',Sys.Date(),'.tif'),
width=300, height=(nrow(df2plot)/2)*250 )
plot(Panel1)
dev.off()
######################################
####Putting em together
######################################
tiff('/Users/sinhas8/Project_Chromotrypsis/prep_final_figures/Figure3Bv1.tif',
width=700, height=700)
lay=rbind(c(NA,2),
c(1,2), c(1,2), c(1,2), c(1,2),
c(1,2), c(1,2), c(1,2), c(1,2), c(1,2))
grid.arrange(Panel1,
plot_grid(leg, Panel2, ncol=1, rel_heights = c(1/10, 9/10)),
layout_matrix=lay)
dev.off()
######################################
####Population overlap
######################################
population_overlap<-function(hist='LUAD', K=1){
ForAA=return_peaks(hist, race='AA')[[2]]
ForEA=return_peaks(hist, race='EA')[[2]]
total_regions=nrow(ForAA)+nrow(ForEA)
total_regions_Amp=nrow(ForAA[ForAA$Type=='Amp',])+nrow(ForEA[ForEA$Type=='Amp',])
total_regions_Del=nrow(ForAA[ForAA$Type=='Del',])+nrow(ForEA[ForEA$Type=='Del',])
#Filter 2: AA-specific recurrent
ForAA_specific=Filter2_AA_specific_recurrent(ForAA, ForEA)
ForAA_specific_Amp=Filter2_AA_specific_recurrent(ForAA[ForAA$Type=='Amp',], ForEA[ForEA$Type=='Amp',])
ForAA_specific_Del=Filter2_AA_specific_recurrent(ForAA[ForAA$Type=='Del',], ForEA[ForEA$Type=='Del',])
nrow(ForAA_specific_Amp)
nrow(ForAA_specific_Del)
return( list( ((nrow(ForAA)-nrow(ForAA_specific))*2)/total_regions,
c(Amp_AA=nrow(ForAA[ForAA$Type=='Amp',]), Amp_EA=nrow(ForEA[ForEA$Type=='Amp',]),
Amp_common= nrow(ForAA[ForAA$Type=='Amp',]) - nrow(ForAA_specific_Amp) ),
c(Del_AA=nrow(ForAA[ForAA$Type=='Del',]), Del_EA=nrow(ForEA[ForEA$Type=='Del',]),
Del_common= nrow(ForAA[ForAA$Type=='Del',]) - nrow(ForAA_specific_Del)) )
)
}
LUAD_ovp=population_overlap('LUAD')
LUSC_ovp=population_overlap('LUSC')