-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path#_GseaGo_GSEAGenesets_Customize.R
210 lines (156 loc) · 8.18 KB
/
#_GseaGo_GSEAGenesets_Customize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
## MSigDB Collections from GSEA
## http://www.gsea-msigdb.org/gsea/msigdb/collections.jsp
##### Presetting ######
rm(list = ls()) # Clean variable
memory.limit(150000)
##### Load Packages #####
if(!require("tidyverse")) install.packages("tidyverse")
library(tidyverse)
if(!require("gtools")) install.packages("gtools")
library(gtools)
##### Condition setting* #####
Set_Species = "Hs" #"Hs","Mm" # Homo sapiens(HS);"Mus musculus"(Mm)
Set_LoadGeneBy = c("Default","symb") # c("Default","symb"),c("Default","entrez"),"Customize"
Set_UpdateGeneName = TRUE # FALSE
Output_FileName_KW <- "EMT" # Export file name of key word(KY)
Set_Fiter = TRUE # FALSE
Set_Fiter_KW.lt = list("EMT",c("trans","epithelial"), c("trans","epithelial","GOBP")) # "Default"
Output_FileName_CTGY <- "C2" # Export file name of Category(CTGY)
Set_Fiter_CTGY <- "C2" # "Default"
## -[] Add setting record
##### Current path and new folder setting* #####
Output_FileName_Desc <- "ComB"
Output_FileName <- paste0(Output_FileName_Desc,"_",Set_Species)
if(Set_LoadGeneBy[1] == "Customize"){
InputFolder = "Cust_GSEA_Genesets_Test"
}else if(Set_LoadGeneBy[1] == "Default"){
InputFolder = "Input_Genesets"
}
OutputFolder <- paste0("Input_Genesets/", InputFolder,"_", Output_FileName)
dir.create(OutputFolder) ## Generate output folder
##### Import files & Combine df #####
#### Import default RData ####
load(paste0("Input_Genesets/Genesets_Default.RData"))
## clean up objects
if(Set_Species == "Hs"){
rm(list = str_subset(objects(), pattern = "_Mm_"))
}else if(Set_Species == "Mm"){
rm(list = str_subset(objects(), pattern = "_Hs_"))
}else{
print("Please Set Species in Hs or Mm")
}
if(Set_LoadGeneBy[1] == "Default" && Set_LoadGeneBy[2] =="symb"){
rm(list = str_subset(objects(), pattern = "_entrez_"))
}else if(Set_LoadGeneBy[1] == "Default" && Set_LoadGeneBy[2] == "entrez"){
rm(list = str_subset(objects(), pattern = "_symb_"))
}else if(Set_LoadGeneBy[1] == "Customize"){
rm(list = str_subset(objects(), pattern = "_entrez_"))
rm(list = str_subset(objects(), pattern = "_symb_"))
}
assign("GSEAGeneSet_MetaData.df", get(str_subset(objects(), pattern = "_XML.df")))
rm(list = str_subset(objects(), pattern = "_XML.df"))
if(Set_LoadGeneBy[1] == "Default"){
assign("GSEAGeneSet.df", get(str_subset(objects(), pattern = "_gmt.df")))
rm(list = str_subset(objects(), pattern = "_gmt.df"))
}else if(Set_LoadGeneBy[1] == "Customize"){
#### Import Customization ####
# target.dir <- list.dirs( paste0("Input_Genesets/", InputFolder) )[-1]
FilesList.set <- list.files(paste0("Input_Genesets/", InputFolder),full.names = T) %>%
str_subset(., pattern = "\\.gmt$")
Nfiles = length(FilesList.set)
for(i in 1:Nfiles){
if(i==1){
# Deal with different number of columns
GSEAGeneSet.df <- read.delim2(FilesList.set[1],
col.names = 1:max(count.fields(FilesList.set[1])),
header = F,sep = "\t")
}else{
new_1 <- read.delim2(paste0(FilesList.set[i]),
col.names = 1:max(count.fields(FilesList.set[i])),
header = F,sep = "\t")
GSEAGeneSet.df <- smartbind(GSEAGeneSet.df,new_1)
}
}
rm(new_1,i)
}
#### Clean up df ####
## Remove duplicated
GSEAGeneSet.df <- GSEAGeneSet.df[!duplicated(GSEAGeneSet.df[,2]), ]
# ## Remove NA (Have set in the write.table) # Ref: https://www.delftstack.com/zh-tw/howto/r/replace-na-with-0-in-r/
# GSEAGeneSet.df[is.na(GSEAGeneSet.df)] <- ""
##### Update gene name ####
##### Export Result of Combine #####
## Note ## Need to remove the quote
# write.table(GSEAGeneSet.df,paste0(OutputFolder,"/",InputFolder,'_',Output_FileName ,'.txt'),
# row.names = FALSE,col.names= FALSE,quote = FALSE, sep = '\t', na="")
write.table(GSEAGeneSet.df,paste0(OutputFolder,"/",InputFolder,'_',Output_FileName ,'.gmt'),
row.names = FALSE,col.names= FALSE,quote = FALSE, sep = '\t', na="")
##### Filter by Keywords* #####
Output_FileName_KW <- "EMT" # Export file name
Keyword.lt <- list("EMT",c("trans","epithelial"), c("trans","epithelial","GOBP"))
for(i in 1:length(Keyword.lt)){
if(length(Keyword.lt[[i]])==1){
GSEAGeneSet_FLT_Temp.df <- GSEAGeneSet.df[grepl(Keyword.lt[[i]],GSEAGeneSet.df[,1], ignore.case=TRUE),]
}else if(length(Keyword.lt[[i]])==2){
GSEAGeneSet_FLT_Temp.df <- GSEAGeneSet.df[grepl(Keyword.lt[[i]][1],GSEAGeneSet.df[,1], ignore.case=TRUE)
& grepl(Keyword.lt[[i]][2],GSEAGeneSet.df[,1], ignore.case=TRUE),]
}else if(length(Keyword.lt[[i]])==3){
GSEAGeneSet_FLT_Temp.df <- GSEAGeneSet.df[grepl(Keyword.lt[[i]][1],GSEAGeneSet.df[,1], ignore.case=TRUE)
& grepl(Keyword.lt[[i]][2],GSEAGeneSet.df[,1], ignore.case=TRUE)
& grepl(Keyword.lt[[i]][3],GSEAGeneSet.df[,1], ignore.case=TRUE),]
}else{
GSEAGeneSet_FLT_Temp.df <- GSEAGeneSet.df[grepl(Keyword.lt[[i]][1],GSEAGeneSet.df[,1], ignore.case=TRUE)
& grepl(Keyword.lt[[i]][2],GSEAGeneSet.df[,1], ignore.case=TRUE)
& grepl(Keyword.lt[[i]][3],GSEAGeneSet.df[,1], ignore.case=TRUE),]
print(paste0("In",i,": Only the first 3 elements will be used")) ## 可以嘗試用條件式+迴圈的方式 ##整體改成用Apply寫
}
if(i==1){
GSEAGeneSet_FLT.df <- GSEAGeneSet_FLT_Temp.df
}else{
GSEAGeneSet_FLT.df <- smartbind(GSEAGeneSet_FLT.df,GSEAGeneSet_FLT_Temp.df)
}
}
rm(i,GSEAGeneSet_FLT_Temp.df)
#### Clean up df ####
## Remove duplicated
GSEAGeneSet_FLT.df <- GSEAGeneSet_FLT.df[!duplicated(GSEAGeneSet_FLT.df[,2]), ]
##### Export Result #####
## Note ## Need to remove the quote
# write.table(GSEAGeneSet_FLT.df,paste0(OutputFolder,"/",InputFolder,'_',Output_FileName,'_',Output_FileName_KW ,'.txt'),
# row.names = FALSE,col.names= FALSE,quote = FALSE, sep = '\t', na="")
write.table(GSEAGeneSet_FLT.df,paste0(OutputFolder,"/",InputFolder,'_',Output_FileName,'_',Output_FileName_KW ,'.gmt'),
row.names = FALSE,col.names= FALSE,quote = FALSE, sep = '\t', na="")
### (pending) How to add conditions to a logical vector with a loop [r]
## Intersect all
## Ref: https://stackoverflow.com/questions/8817533/loop-of-a-loop-in-r
## Add conditions to a logical vector with a loop [r]
## https://stackoverflow.com/questions/40994881/add-conditions-to-a-logical-vector-with-a-loop-r
#################################################################################################################################
##### Choose specific Genesets* #####
Output_FileName_SPEC = "SPEC" # Export file name
Int_Path.set <- c(
"REACTOME_ACTIVATION_OF_ATR_IN_RESPONSE_TO_REPLICATION_STRESS",
"REACTOME_NUCLEAR_PORE_COMPLEX_NPC_DISASSEMBLY",
"HALLMARK_E2F_TARGETS",
"REACTOME_DNA_REPLICATION",
"REACTOME_G2_M_CHECKPOINTS",
"KEGG_OLFACTORY_TRANSDUCTION",
"KEGG_CALCIUM_SIGNALING_PATHWAY"
)
GSEAGeneSet_SPEC.df <- GSEAGeneSet.df[GSEAGeneSet.df[,1] %in% Int_Path.set,]
##### Export Result #####
## Note ## Need to remove the quote
# write.table(GSEAGeneSet_SPEC.df, paste0(OutputFolder,"/",InputFolder,'_',Output_FileName,'_',Output_FileName_SPEC ,'.txt'),
# row.names = FALSE,col.names= FALSE,quote = FALSE, sep = '\t', na="")
write.table(GSEAGeneSet_SPEC.df, paste0(OutputFolder,"/",InputFolder,'_',Output_FileName,'_',Output_FileName_SPEC ,'.gmt'),
row.names = FALSE,col.names= FALSE,quote = FALSE, sep = '\t', na="")
#### Save RData ####
save.image(paste0(OutputFolder,"/",InputFolder,'_', Output_FileName,".RData"))
#################################################################################
#### TO-do list ####
## 加入Category篩選條件
## 創建Geneset by 自己的實驗或線上數據(文字型,matrix型)
## 內文文字篩選
## 更新基因名稱
## 條件篩選可以嘗試用條件式+迴圈的方式
## 整體改成用Apply寫