-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworking_deg.R
257 lines (221 loc) · 10.4 KB
/
working_deg.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
setwd("C:/Users/rylash/OneDrive - Michigan Medicine/Desktop/RNAseq_anals")
#gene_expected<- read.table('gene_expected_count.csv', header=TRUE, sep=',', fill=TRUE)
#gene_tpm<-read.table('gene_tpm_annot.csv', header=TRUE, sep=',', fill=TRUE)
#gene_fpkm<-read.table('gene_fpkm_annot.csv', header=TRUE, sep=',', fill=TRUE)
###### THIS IS CODE FROM THE AGC FINAL REPORT FOUND IN 'report_final.html'#######
data <- read.delim("gene_expected_count.csv", sep=',', row.names = NULL)# Deal with genes that don't have annotated gene symbols (external_gene_name)
# Use ENSEMBL ID if gene symbol not available
data$external_gene_name <- ifelse(
data$external_gene_name == ".",
data$gene_id,
data$external_gene_name
)
# Deal with duplicated gene symbols
# Combine gene symbol with ENSEMBL ID if non-unique
data$external_gene_name <- ifelse(
duplicated(data$external_gene_name),
paste(data$external_gene_name, data$gene_id, sep="_"),
data$external_gene_name
)
# Then we can use the gene symbol column as the row names,
# and subset the count data for further analysis
rownames(data) <- data$external_gene_name
count.data <- data[,5:ncol(data)] # All columns after 4 are count data
########## Subset counts data to include only useful samples ####################
count.data2<-count.data[,c('X9821.RL.1', 'X9821.RL.3', 'X9821.RL.5', 'X9821.RL.6')]
View(count.data2)
#create column data frame with treatment information for deseq2
col_data2 <- data.frame(
sample_ID = c('X9821.RL.1', 'X9821.RL.3', 'X9821.RL.5', 'X9821.RL.6'),
treatment = c('sham', 'lesion', 'sham', 'lesion')
)
#treatments need to be designated as factors
col_data2$treatment <- factor(col_data2$treatment)
col_data2$treatment <- relevel(col_data2$treatment, ref = "sham")
# ########Running Sleuth
# library(sleuth)
# sample_to_covariates <- list(
# X9821.RL.1 = c(treatment = "sham"),
# X9821.RL.3 = c(treatment = "lesion"),
# X9821.RL.5 = c(treatment = "sham"),
# X9821.RL.6 = c(treatment = "lesion")
# )
# col_data2$treatment <- factor(col_data2$treatment)
#
# # Prepare count data with sleuth
# so <- sleuth_prep(count_data = count.data2,
# sample_table = col_data2,
# sample_to_covariates = sample_to_covariates,
# gene_mode = TRUE)
# so <- sleuth_prep(count_data = count.data2,
# sample_table = col_data2,
# gene_mode = TRUE)
#Running DESeq2
library(DESeq2)
dds2 <- DESeqDataSetFromMatrix(countData = count.data2,
colData = col_data2,
design = ~ treatment)
dds2<-DESeq(dds2)
results_deseq<-results(dds2)
#dds$design <- ~ treatment
BiocManager::install('apeglm')
write.csv(as.data.frame(significant_genes_p1),
file="results_deseq_p1.csv")
# res01 <- results(dds2, alpha=0.01)
# summary(res01)
# res001 <- results(dds2, alpha=0.001)
# summary(res001)
resLFC <- lfcShrink(dds2, coef="treatment_sham_vs_lesion", type="apeglm")
resLFC
plotMA(resLFC)
plotMA(results_deseq)
plotMA(results_deseq, xlab='mean of normalized counts', ylim(-8,8), colNonSig='navy',colSig='orangered')
plotCounts(dds2, gene=which.min(results_deseq$padj), intgroup="condition")
################# Create Volcano Plot############################################
with(results_deseq, plot(log2FoldChange, -log10(padj),
main="Volcano Plot",
xlim=c(-10, 10),
ylim=c(0, 8),
pch=20,
col=ifelse(padj < 0.05, "red", "black")))
library(ggplot2)
results_df<-as.data.frame(results_deseq)
ggplot(results_df, aes(x = log2FoldChange, y = -log10(padj), color = factor(results_df$padj < 0.05))) +
geom_point(size = 1.5, alpha = 0.7) +
scale_color_manual(values = c("navy", "orangered")) +
labs(x = "Log2Fold Change",
y = "-Log10(Adjusted p-value)",
color = "Significant (padj<0.05)") +
theme_light()+
xlim(-10,10)+
ylim(0,15)+
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "hotpink") +
geom_hline(yintercept = -log10(0.01), linetype = "dashed", color = "red")+
geom_hline(yintercept = -log10(0.001), linetype = "dashed", color = "darkred")+# Add horizontal line at -log10(0.05)
geom_text(data = subset(results_df, rownames(results_df) %in% c("Cacna1h", 'Slc17a7', "Sst",'Gabbr2')),
aes(label = rownames(subset(results_df, rownames(results_df) %in% c("Cacna1h", 'Slc17a7', "Sst",'Gabbr2')))),
vjust = -1, hjust = 0.4, size = 4, color = "black")
ggplot(results_deseq, aes(x = log2FoldChange, y = -log10(padj), color = factor(results_deseq$padj < 0.05))) +
geom_point(size = 1.5, alpha = 0.7) +
scale_color_manual(values = c("navy", "orangered")) +
labs(x = "Log2 Fold Change",
y = "-Log10(Adjusted p-value)",
title = "Volcano Plot of DEGs",
color = "Significant (padj<0.05)") +
theme_light()+
xlim(-10,10)+
ylim(0,15)+ geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "hotpink") # Add horizontal line at -log10(0.05)
################Subsetting significant genes based on p-values###################
significant_genes_p1 <- subset(results_deseq, padj < .1)
significant_genes_p05b <- subset(results_deseq, padj < .05)
significant_genes_p01 <- subset(results_deseq, padj < .01)
significant_genes_p001 <- subset(results_deseq, padj < .001)
plotMA(results_deseq)
# Add labels for significant genes ##Currently doesn't work
#significant_genes_01 <- results_deseq[results_deseq$padj < 0.01, ]
#text(significant_genes_001$log2FoldChange, -log10(significant_genes_001$padj), labels=rownames(significant_genes), col="red", cex=0.7)
#head(results_deseq)
###GENERATING SUBSET DDS BASED ONLY ON GENES WITH DESIGNATED SIGNIFICANT P VALUE###
DE_genes_names <- rownames(res001)
DE_genes_names <- rownames(significant_genes_p001)
# Subset the original DESeqDataSet based on differentially expressed genes
subset_dds <- dds2[DE_genes_names, ]
results_sub <- results(subset_dds)
############################Heatmap code#########################################
library(pheatmap)
###Subsetting for log fold change
log_fold_changes <- results_sub$log2FoldChange
gene_names <- rownames(subset_dds)
# Create a data frame with the log-fold changes and gene names
heatmap_data <- data.frame(LogFC = log_fold_changes, Gene = DE_genes_names)
pheatmap(heatmap_data)
pheatmap(heatmap_data,
clustering_distance_cols = 'euclidean',
main = "Heatmap of DEGs (p<0.001)")
### Assuming subset_dds is your DESeqDataSet###
heatmap_matrix <- assay(subset_dds, name=)
#heatmap_head<-head(heatmap_matrix,2:20)
# Scale the counts (optional, but can be useful for visualization)
scaled_matrix <- scale(heatmap_matrix)
#heatmap_matrix_sg2<-assay(significant_genes2)
# Create the heatmap ##NEED TO FIX THE SCALING
pheatmap(heatmap_matrix,
cluster_rows = TRUE,
cluster_cols = TRUE,
clustering_distance_cols = 'euclidean',
color = colorRampPalette(c("navy", "white", "orangered"))(100),
scale='row',
fontsize_row = 8,
main = "Heatmap of DEGs (p<0.001)")
#######PCA EXPLORER#####
###################TSNE plots####################################################
library(Rtsne)
# Transform the DESeqDataSet using rlog or vst
dds_transformed <- rlog(dds2) # or vst(dds)
# Extract the transformed data matrix
data_matrix <- assay(dds_transformed)
# Run t-SNE
tsne_result <- Rtsne(data_matrix, dims = 2, perplexity = 10, check_duplicates = FALSE)
# Create a data frame with the t-SNE coordinates
tsne_df <- as.data.frame(tsne_result$Y)
# Plot the t-SNE plot
plot(tsne_df, col = col_data2$treatment, pch = 16, main = "t-SNE Plot")
legend("topright", legend = levels(col_data2$treatment), col = 1:2, pch = 16, title = "Treatment")
##########################UMAP Plots ############################################
library(umap)
dds_norm <- vst(dds2)
normalized_counts <- assay(dds_norm)
normalized_counts<-t(normalized_counts)
umap_result<-umap::umap(normalized_counts, n_neighbors=2)
umap.defaults
# Assuming dds is your DESeqDataSet
dds_transformed <- rlog(dds2) # or vst(dds)
data_matrix <- assay(dds2)
data_matrix2<-t(data_matrix)
library(umap)
# Run UMAP
umap_result <- umap(data_matrix, n_neighbors = 15, n_components = , min_dist = 0.25)
# Create a UMAP plot
umap_df <- as.data.frame(umap_result$layout)
plot(umap_df, col = col_data2$treatment, pch = 16, main = "UMAP Plot")
legend("topright", legend = levels(col_data$treatment), col = 1:2, pch = 16, title = "Treatment")
######################GO and KEGG Enrichment Analysis############################
BiocManager::install("clusterProfiler")
BiocManager::install("org.Rn.eg.db")
library(clusterProfiler)
library(org.Rn.eg.db)
# Example gene list for rats
rat_gene_list <- c("Gene1", "Gene2", "Gene3", ...)
# Perform KEGG enrichment analysis for rats
kegg_enrich_rat <- enrichKEGG(gene = significant_genes_p001, organism = "rno", keyType = "org.Rn.eg", pvalueCutoff = 0.05)
print(kegg_enrich_rat)
###############RUNNING DESeq2 on entire counts data set##########################
library('DESeq2')
str(count.data)
##set up matrix for deseq dds variable, counts data is finished
#col data needs to be created as a two level factor variable
#and ran for construction of dds variable to complete analysis
col_data <- data.frame(
sample_ID = c('X9821.RL.1', 'X9821.RL.2', 'X9821.RL.3', 'X9821.RL.4', 'X9821.RL.5', 'X9821.RL.6'),
treatment = c('sham', 'lesion', 'lesion', 'sham', 'sham', 'lesion')
)
col_data$treatment <- factor(col_data$treatment)
dds <- DESeqDataSetFromMatrix(countData = count.data,
colData = col_data,
design = ~ treatment)
#dds$design <- ~ treatment
dds <- DESeq(dds)
results_deseq <- results(dds)
significant_genes2 <- subset(results_deseq, padj < .5)
plotMA(results_deseq)
if (!requireNamespace("EnhancedVolcano", quietly = TRUE)) {
install.packages("EnhancedVolcano")
}
library(EnhancedVolcano)
plot(results_deseq, main="Volcano Plot", xlim=c(-2,2), ylim=c(0,8))
with(results_deseq, plot(log2FoldChange, -log10(padj),
main="Volcano Plot",
xlim=c(-10, 10),
ylim=c(0, 8),
pch=20,
col=ifelse(padj < 0.7, "red", "black")))