.Rhistory

# First look at the GEO2R top table exported for GSE21942.
library(tidyverse)

# NOTE(review): readr::read_table() splits columns on runs of whitespace.
# The file is named *.tsv; if any field (e.g. a gene title) contains spaces,
# read_tsv() would be the safer reader -- confirm against the file contents.
gene_data <- read_table("geo2r/data/GSE21942.top.table.tsv")

# Inspect the parsed table once: data viewer, then the first rows on the
# console.
View(gene_data)
head(gene_data)
# Rank the most significant over-expressed rows of the GSE21942 top table.
library(tidyverse)
library(openxlsx)

# read tables
gene_table <- read_table("geo2r/data/GSE21942.top.table.tsv")

# convert data into factors
gene_table$GeneSymbol <- as.factor(gene_table$GeneSymbol)
gene_table$ID <- as.factor(gene_table$ID)
gene_table$GeneTitle <- as.factor(gene_table$GeneTitle)

# Keep rows with adjusted p-value < 0.05 and logFC > 1, order them by
# adjusted p-value (ascending) and keep the first 20.
# The sort column must be `adjPVal`, the same column the filter uses.
over_expressed_genes <- gene_table |>
  filter(adjPVal < 0.05 & logFC > 1) |>
  arrange(adjPVal) |>
  head(20)
View(over_expressed_genes)
# Version info: R 4.2.2, Biobase 2.58.0, GEOquery 2.66.0, limma 3.54.0
################################################################
#   Differential expression analysis with limma

# Packages used by the analysis below.
library(GEOquery)
library(limma)
library(umap)

# Fetch series GSE21942 from GEO as series-matrix data with the annotated
# platform record.
gset <- getGEO("GSE21942", GSEMatrix = TRUE, AnnotGPL = TRUE)

# getGEO() can return more than one entry; in that case pick the entry whose
# name mentions GPL570, otherwise take the only one.
idx <- if (length(gset) > 1) {
  grep("GPL570", attr(gset, "names"))
} else {
  1
}
gset <- gset[[idx]]
# make proper column names to match toptable
fvarLabels(gset) <- make.names(fvarLabels(gset))

# group membership for all samples: one character per sample, split into a
# character vector of per-sample codes
gsms <- "00000000000000011111111111111"
sml <- strsplit(gsms, split = "")[[1]]

# log2 transformation
# Quantiles of the expression matrix (min, quartiles, 99th percentile, max)
# drive the decision: log2 is applied when the 99th percentile exceeds 100,
# or when the range exceeds 50 and the first quartile is positive.
ex <- exprs(gset)
qx <- as.numeric(
  quantile(ex, c(0, 0.25, 0.5, 0.75, 0.99, 1), na.rm = TRUE)
)
LogC <- (qx[5] > 100) ||
  (qx[6] - qx[1] > 50 && qx[2] > 0)
if (LogC) {
  # Non-positive values have no real log2; mark them NaN before transforming.
  ex[which(ex <= 0)] <- NaN
  exprs(gset) <- log2(ex)
}
# Turn the per-sample codes into a factor and relabel its levels with
# syntactically valid group names.
groups <- make.names(c("Control", "MS"))
gs <- factor(sml)
levels(gs) <- groups
gset$group <- gs

# Design matrix without an intercept: one indicator column per group,
# named after the group levels.
design <- model.matrix(~ group + 0, gset)
colnames(design) <- levels(gs)

# Drop every row of the expression set that contains a missing value.
gset <- gset[complete.cases(exprs(gset)), ]

# Fit the linear model for the remaining rows.
fit <- lmFit(gset, design)
# set up contrasts of interest and recalculate model coefficients
# The contrast string is "<groups[1]>-<groups[2]>", i.e. "Control-MS".
cts <- paste(groups[1], groups[2], sep = "-")
cont.matrix <- makeContrasts(contrasts = cts, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)

# compute statistics and table of top significant genes
# Arguments are spelled out in full (`proportion`, `adjust.method`) instead of
# relying on positional / partial argument matching.
fit2 <- eBayes(fit2, proportion = 0.01)
tT <- topTable(fit2, adjust.method = "fdr", sort.by = "B", number = 250)

# NOTE(review): the selection lists both "Gene.Symbol" and "Gene.symbol";
# subset() fails if any listed column is absent from tT -- confirm that all
# of these columns exist for this platform annotation.
tT <- subset(
  tT,
  select = c(
    "ID", "adj.P.Val", "P.Value", "t", "B", "logFC",
    "GB_ACC", "SPOT_ID", "Gene.Symbol", "Gene.symbol", "Gene.title"
  )
)
# Histogram of adjusted p-values over all rows of the fit; the working
# assumption is that most genes are not differentially expressed.
tT2 <- topTable(fit2, adjust.method = "fdr", sort.by = "B", number = Inf)
hist(tT2$adj.P.Val, col = "grey", border = "white", xlab = "P-adj",
     ylab = "Number of genes", main = "P-adj value distribution")

# summarize test results as "up", "down" or "not expressed"
dT <- decideTests(fit2, adjust.method = "fdr", p.value = 0.05, lfc = 0)

# Venn diagram of results
vennDiagram(dT, circle.col = palette())

# create Q-Q plot for t-statistic
t.good <- which(!is.na(fit2$F)) # filter out bad probes
qqt(fit2$t[t.good], fit2$df.total[t.good], main = "Moderated t statistic")

# volcano plot (log P-value vs log fold change)
colnames(fit2) # list contrast names
ct <- 1        # choose contrast of interest

# Please note that the code provided to generate graphs serves as a guidance to
# the users. It does not replicate the exact GEO2R web display due to multitude
# of graphical options.
#
# The following will produce basic volcano plot using limma function:
# `highlight` is the number of rows decideTests() flagged as non-zero for the
# chosen contrast; each highlighted point is labelled with "+".
volcanoplot(fit2, coef = ct, main = colnames(fit2)[ct], pch = 20,
            highlight = length(which(dT[, ct] != 0)),
            names = rep("+", nrow(fit2)))

# MD plot (log fold change vs mean log expression)
# highlight statistically significant (p-adj < 0.05) probes
plotMD(fit2, column = ct, status = dT[, ct], legend = FALSE, pch = 20, cex = 1)
abline(h = 0)
################################################################
# General expression data analysis
ex <- exprs(gset)

# box-and-whisker plot
ord <- order(gs)  # order samples by group
# Replace the session colour palette; `col = gs[ord]` below indexes into it.
palette(c("#1B9E77", "#7570B3", "#E7298A", "#E6AB02", "#D95F02",
          "#66A61E", "#A6761D", "#B32424", "#B324B3", "#666666"))
par(mar = c(7, 4, 2, 1))
title <- paste0("GSE21942", "/", annotation(gset))
boxplot(ex[, ord], boxwex = 0.6, notch = TRUE, main = title,
        outline = FALSE, las = 2, col = gs[ord])
legend("topleft", groups, fill = palette(), bty = "n")

# expression value distribution
par(mar = c(4, 4, 2, 1))
title <- paste0("GSE21942", "/", annotation(gset), " value distribution")
plotDensities(ex, group = gs, main = title, legend = "topright")
# UMAP plot (dimensionality reduction)
ex <- na.omit(ex) # eliminate rows with NAs
ex <- ex[!duplicated(ex), ]  # remove duplicates
# umap() receives the transposed matrix, so each sample becomes one point.
ump <- umap(t(ex), n_neighbors = 12, random_state = 123)
# Wide right margin plus xpd = TRUE so the legend can sit outside the plot
# region (see the negative inset below).
par(mar = c(3, 3, 2, 6), xpd = TRUE)
plot(ump$layout, main = "UMAP plot, nbrs=12", xlab = "", ylab = "",
     col = gs, pch = 20, cex = 1.5)
legend("topright", inset = c(-0.15, 0), legend = levels(gs), pch = 20,
       col = seq_len(nlevels(gs)), title = "Group", pt.cex = 1.5)

# NOTE(review): maptools is attached for point labels without overlaps, but
# no maptools function is called in the code visible here -- confirm whether
# a labelling call is missing or the dependency can be dropped.
library("maptools")  # point labels without overlaps

# mean-variance trend, helps to see if precision weights are needed
plotSA(fit2, main = "Mean variance trend, GSE21942")
# Documentation: https://bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/query.html

# Install the Bioconductor packages used below only when they are missing,
# and do it before any of them is attached. requireNamespace() returns FALSE
# (instead of raising an error) for a package that is not installed.
bioc_packages <- c("TCGAbiolinks", "maftools", "SummarizedExperiment")
is_installed <- vapply(
  bioc_packages, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  BiocManager::install(bioc_packages[!is_installed])
}

# Attach each package once.
library(tidyverse)
library(TCGAbiolinks)
library(maftools)
library(pheatmap)
library(SummarizedExperiment)
# List the available GDC projects and print the summary for TCGA-BRCA.
gdcprojects <- getGDCprojects()
getProjectSummary("TCGA-BRCA")
View(gdcprojects)

# Broad query: everything in the "Transcriptome Profiling" category of the
# project, with the result table opened in the data viewer.
query_TCGA <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling"
)
output_query_TCGA <- getResults(query_TCGA)
View(output_query_TCGA)

# Narrowed query (replaces the broad one): open-access RNA-Seq
# "STAR - Counts" files restricted to three sample barcodes.
query_TCGA <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling",
  experimental.strategy = "RNA-Seq",
  workflow.type = "STAR - Counts",
  access = "open",
  barcode = c(
    "TCGA-LL-A73Y-01A-11R-A33J-07",
    "TCGA-E2-A1IU-01A-11R-A14D-07",
    "TCGA-AO-A03U-01B-21R-A10J-07"
  )
)
getResults(query_TCGA)

# Download the files matched by the narrowed query.
GDCdownload(query_TCGA)

# Assemble the downloaded files into a SummarizedExperiment, then pull out
# the assay named "fpkm_unstrand" and inspect it.
tcga_brca_data <- GDCprepare(query_TCGA, summarizedExperiment = TRUE)
brca_matrix <- assay(tcga_brca_data, "fpkm_unstrand")
View(brca_matrix)
# Differentially expressed genes from the GEPIA2 BRCA table.
library(tidyverse)
library(openxlsx)

# read tables
# The file is parsed once; both names used by this script refer to the same
# table.
gene_data <- read_table("gepia2/data/table_degenes_brca.txt")
geneexp_data <- gene_data

# Rows with adjusted p-value < 0.05 and |Log2FoldChange| > 1.
significant_genes <- gene_data |>
  filter(adjp < 0.05 & abs(Log2FoldChange) > 1)
View(significant_genes)

# Scatter of Log2FoldChange (y) against -log10(adjusted p-value) (x); points
# meeting the same thresholds as `significant_genes` are drawn in red.
plot(-log10(gene_data$adjp), gene_data$Log2FoldChange,
     xlab = "-log10(adjusted p-value)", ylab = "Log2FoldChange",
     main = "Volcano Plot", pch = 20,
     col = ifelse(
       abs(gene_data$Log2FoldChange) > 1 & gene_data$adjp < 0.05,
       "red", "black"
     ))
View(geneexp_data)
# Install fgsea from Bioconductor only when it is not already available,
# instead of installing and then force-reinstalling on every run.
if (!requireNamespace("fgsea", quietly = TRUE)) {
  BiocManager::install("fgsea")
}
# Documentation: https://bioconductor.org/packages/release/bioc/vignettes/TCGAbiolinks/inst/doc/query.html

# Packages for the TCGA query below.
library(tidyverse)
library(TCGAbiolinks)
library(maftools)
library(pheatmap)
library(SummarizedExperiment)

# Fetch the list of GDC projects and print the TCGA-BRCA summary.
gdcprojects <- getGDCprojects()
getProjectSummary("TCGA-BRCA")

# Query every "Transcriptome Profiling" entry of the project, extract the
# result table, and open both the query object and the table in the viewer.
query_TCGA <- GDCquery(
  project = "TCGA-BRCA",
  data.category = "Transcriptome Profiling"
)
output_query_TCGA <- getResults(query_TCGA)
View(query_TCGA)
View(output_query_TCGA)
# load packages
library(tidyverse)

# read gene tables
gene_tables <- read_table("Thyroid Carcinoma Cancer/data/table_degenes_THCA.txt")

# convert data
gene_tables$GeneSymbol <- as.factor(gene_tables$GeneSymbol)
gene_tables$GeneID <- as.factor(gene_tables$GeneID)

# identify over expressed gene
# Keeps rows with adjusted p-value < 0.05 and Log2FoldChange > 1, then the
# first 20 of them in file order.
# NOTE(review): there is no arrange() before head(20), so this is not a
# "top 20 by significance" list unless the file is already sorted -- confirm.
over_expressed_gene <- gene_tables |>
  filter(adjp < 0.05 & Log2FoldChange > 1) |>
  head(20)