-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsurv_RNAseq.R
171 lines (134 loc) · 6.4 KB
/
surv_RNAseq.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
##################################
# surv_RNAseq for R #
# Created by Mathias Husted Torp #
# 24/06-2016 #
##################################
# Choose cancer types and genes
mycancers <- FALSE # Vector of cancers to perform the analysis on. FALSE = ALL
mygenes <- c('ASS1', 'ARG1', 'ARG2', 'SLC7A1', 'SLC7A2') # Required. Vector of genes to perform the analysis on. E.g. c('TP53', 'TP53,ASS1')
options(echo=T)
library(survival) # Support Kaplan-Meier methods
library(data.table, quietly = TRUE) # Support fread
source("functions.R")
# Other arguments
trimxaxs <- FALSE # Trim the x-axis of each plot, or keep the same length for all
legendpos <- "bottomleft" # Position of the legend box. ("bottomleft" or "topright")
forceunit <- "years" # Force a time unit ("days", "months", "years")
pairwise <- TRUE # Do pairwise analysis of all combinations of the input genes
sdweight <- "none" # ("none", "half", "full", number) Omit patients with expression close to the median
signif <- 0.05 # Cases below this P-value are plotted. Set to 1 to plot all. Default: 0.05
# Folder structure
datadir <- 'Formatted'
outdir <- 'Output'
if(sdweight == 'None') { sdweight <- 0
} else if ( sdweight == 'Half' ) { sdweight <- 0.5
} else if ( sdweight == 'Full' ) { sdweight <- 1.0
}
if( mygenes == '' || !exists('mygenes') ) {
stop('No genes specified')
}
# Get list of all genes in the analysis
uniquegenes <- unique(unlist(strsplit(mygenes, ',', fixed = TRUE)))
if( isFALSE(mycancers) ) {
allcancers <- dir( datadir )
mycancers = allcancers
outfile = paste0(outdir, '/All_', paste0(uniquegenes, collapse = '-'), '.pdf' )
} else {
outfile <- paste0(outdir, '/',
paste0(mycancers, '_', collapse = '_'),
paste0(uniquegenes, collapse = '-'), '.pdf' )
}
if( isTRUE(pairwise) ) {
combinations <- combn(x = uniquegenes, m = 2, paste, collapse = ',')
mygenes <- append(uniquegenes, values = combinations, after = length(uniquegenes))
## Append '_pairwise' to outfile name
outfile <- gsub('^([^[:space:]]+)(\\.pdf)$', '\\1_pairwise\\2', outfile)
}
pdf(file = outfile, width=11, height=8.5, pointsize=12, paper='special')
for(cancer in mycancers) {
cat(paste("Cancer:", cancer, "\r\n"))
survfile <- paste(datadir, cancer, 'survival.txt', sep = '/')
esetfile <- paste(datadir, cancer, 'expression.txt', sep = '/')
clinfile <- paste(datadir, cancer, 'clinical.txt', sep = '/')
#### Read eset file ####
eset <- fread(input = esetfile, sep = "\t", header = TRUE, select = c("TCGA_Barcode", uniquegenes), showProgress = FALSE )
if( length(uniquegenes) != length(eset) - 1 ) {
stop(paste("The gene", uniquegenes[!uniquegenes %in% names(eset)], "is not in the data set"))
}
# Remove outliers based on PCA
eset <- removeoutliers('Outliers.txt', eset)
# Extract names and convert to survival-type
name <- substr(eset$TCGA_Barcode, 0, 12)
npatients <- length(name)
#### Read survival file ####
surv <- read.table(file = survfile, header = TRUE, sep = "\t", row.names = NULL,
colClasses = c('character', 'integer', 'integer'),
col.names = c('Barcode', 'Time', 'Status')
)
#### Read clinical file ####
clin <- read.table(file = clinfile, header = TRUE, sep = "\t",
na.strings = c('[Not Evaluated]', '[Not Available]', '[Unknown]', '[Discrepancy]', '[Not Applicable]'), quote = "")
#### Remove patients with no survival data
clin <- clin[clin$barcode %in% surv$Barcode, ]
#### Remove patients with invalid serial times
clin <- clin[surv$Time > 0, ]
surv <- surv[surv$Time > 0, ]
# if( isTRUE(pairwise) ) {
# genes <- append(mygenes, after=length(mygenes), combn(mygenes, m = 2, FUN = paste, collapse=','))
# }
for(i in 1:length(mygenes)) {
# Check for paired test
gene <- unlist(strsplit(mygenes[i], ',', fixed = TRUE))
#### Separate cohort based on expression of gene ####
group <- splitdata(eset, gene, surv$Barcode)
if(length(gene) == 1) {
nlow <- sum(group == "LOW" )
nhigh <- sum(group == "HIGH")
# Skip plot if all groups are empty
try(checkforemptygroups(nlow, nhigh, cnc = cancer), next)
} else if(length(gene) == 2) {
nlowlow <- sum(group == "LOWLOW" )
nlowhigh <- sum(group == "LOWHIGH" )
nhighlow <- sum(group == "HIGHLOW" )
nhighhigh <- sum(group == "HIGHHIGH")
# Print empty plot, if all groups are empty
try(checkforemptygroups(nlowlow, nlowhigh, nhighlow, nhighhigh), next)
} else {
}
#### Add column "group" to survival file ####
surv$Group <- group
clin$Group <- group
# Remove subjects with no expression group
subsurv <- subset(surv, Group != 0 )
#### Change time unit ####
if(i == 1) {
tunit <- findtimeunit(tail(subsurv$Time, 1), forceunit)
if (tunit == 'years') {
subsurv$Time <- subsurv$Time / 365
maxtime <- tail(subsurv$Time, 1)
} else if (tunit == 'months') {
subsurv$Time <- subsurv$Time / 30.5
maxtime <- tail(subsurv$Time, 1)
}
}
#### Calculate difference between groups ?survdiff ####
temp <- survdiff(Surv(Time, Status) ~ Group, data = subsurv)
#### Calculate p value from chisq ####
pval <- 1 - pchisq(temp$chisq, length(temp$n) - 1)
pval <- signif(pval, digits = 6)
myby <- findmyby(maxtime)
genelabel <- paste(gene, collapse = ' and ')
if( pval <= signif ) {
#### Fit survival ?survfit ####
mysurv <- survfit(Surv(Time, Status) ~ Group, data = subsurv)
#### Plot survfit element ?plot ####
mybeginplot(mysurv, maxtime = maxtime)
myplot(mysurv, maxtime = maxtime)
#### Add legend ?legend ?text ####
addlegend(legendpos)
### Test for confounders ###
# confounders(clin, mygenes[i])
}
}
}
invisible(dev.off())