Skip to content

Commit

Permalink
EGFR in DMG
Browse files Browse the repository at this point in the history
  • Loading branch information
zzgeng committed Jan 16, 2024
1 parent dfe6d9f commit f312057
Show file tree
Hide file tree
Showing 11 changed files with 4,635 additions and 0 deletions.
38 changes: 38 additions & 0 deletions analyses/DMG_analysis/00-EGFR_maf_subset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
## subset maf file
library(tidyverse)

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")

hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))

## subset genes of interests
gene_of_interests <- c("H3-3A", "H3C2", "H3C3", "H3C14",
"EGFR", "TP53", "ATRX", "NF1")

tumor_only_maf <- data.table::fread(
file.path(data_dir, "snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz"),
data.table = FALSE) %>%
filter(Tumor_Sample_Barcode %in% selected_sample$Kids_First_Biospecimen_ID,
Hugo_Symbol %in% gene_of_interests)

# Add tumor only to consensus MAF
snv_consensus_hotspot_maf <- data.table::fread(
file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz"),
data.table = FALSE) %>%
filter(Tumor_Sample_Barcode %in% selected_sample$Kids_First_Biospecimen_ID,
Hugo_Symbol %in% gene_of_interests)

tumor_only_maf$PUBMED <- as.character(tumor_only_maf$PUBMED)
tumor_only_maf$PHENO <- as.character(tumor_only_maf$PHENO)
tumor_only_maf$gnomad_3_1_1_AF_non_cancer <- as.character(tumor_only_maf$gnomad_3_1_1_AF_non_cancer)

snv_final <- snv_consensus_hotspot_maf %>%
dplyr::bind_rows(tumor_only_maf)

write_tsv(snv_final, file.path(subset_dir, "snv_selected_genes.maf.tsv"))
12 changes: 12 additions & 0 deletions analyses/DMG_analysis/00-onco_annotate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

set -e
set -o pipefail

python_path=(/usr/bin/python3)
maf_annotator=(/Users/gengz/Documents/GitHub/oncokb-annotator/MafAnnotator.py)
maf_in=(/Users/gengz/Documents/GitHub/OpenPedCan-analysis/analyses/DMG_analysis/subset/snv_selected_genes.maf.tsv)
maf_oncokb_out=(/Users/gengz/Documents/GitHub/OpenPedCan-analysis/analyses/DMG_analysis/subset/snv_selected_genes_oncokb.maf.tsv)

# Run maf_annotator on dgd samples
$python_path $maf_annotator -i $maf_in -o $maf_oncokb_out -b $ONCO_KB -q hgvsp_short -r GRCh38
74 changes: 74 additions & 0 deletions analyses/DMG_analysis/01-data preparation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
library(tidyverse)
library(reshape2)

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")


## read histologies
hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))

##snv file
snv_annotate <- read_tsv(file.path(subset_dir, "snv_selected_genes_oncokb.maf.tsv"))
snv_annotate_short <- snv_annotate %>%
mutate(MUTATION_EFFECT = case_when(Hugo_Symbol == "EGFR" &
HGVSp_Short %in% c("p.A289V", "p.A289T") ~
paste(MUTATION_EFFECT, "WHO mutation", sep = ";"),
TRUE ~ MUTATION_EFFECT)) %>%
select(Tumor_Sample_Barcode, MUTATION_EFFECT, Hugo_Symbol)

## cn files
cn_df <- read_tsv(file.path(data_dir, "consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz"))
cn_filtered <- cn_df %>%
filter(biospecimen_id %in% selected_sample$Kids_First_Biospecimen_ID,
gene_symbol == "EGFR") %>%
select(biospecimen_id, status, gene_symbol) %>%
dplyr::rename("Tumor_Sample_Barcode" = "biospecimen_id",
"MUTATION_EFFECT" = "status",
"Hugo_Symbol" = "gene_symbol") %>%
mutate(MUTATION_EFFECT = case_when(MUTATION_EFFECT == "amplification" ~ "amplification",
TRUE ~ ""))
rm(cn_df)

## combined cnv with snv
snv_onco <- cn_filtered %>%
bind_rows(snv_annotate_short) %>%
left_join(hist %>% select(Kids_First_Biospecimen_ID, match_id),
by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
select(match_id, MUTATION_EFFECT, Hugo_Symbol) %>%
unique() %>%
group_by(match_id, Hugo_Symbol) %>%
summarise(status = paste0(MUTATION_EFFECT, collapse = ";")) %>%
spread(Hugo_Symbol, status)

## RNA files
RNA <- read_rds(file.path(data_dir, "gene-expression-rsem-tpm-collapsed.rds"))
col_rna <- selected_sample %>% filter(experimental_strategy == "RNA-Seq") %>% select(Kids_First_Biospecimen_ID, match_id)

select_RNA <- RNA[rownames(RNA) %in% c("EZHIP", "EGFR"), col_rna %>% pull(Kids_First_Biospecimen_ID)]
select_RNA <- as.data.frame(scale(t(select_RNA)))
select_RNA$Kids_First_Biospecimen_ID <- rownames(select_RNA)

select_RNA <- select_RNA %>%
mutate(EGFR_exp = case_when(EGFR >=3 ~ "overexpression",
TRUE ~ NA),
EZHIP_exp = case_when(EZHIP >=3 ~ "overexpression",
TRUE ~ NA)) %>%
left_join(selected_sample %>% select(Kids_First_Biospecimen_ID, match_id)) %>%
select(match_id, EGFR_exp, EZHIP_exp) %>%
unique()

## gather all datas into one table
final_df <- selected_sample %>%
select(match_id, age_at_diagnosis_days, reported_gender, molecular_subtype, tumor_descriptor) %>%
unique() %>%
left_join(snv_onco) %>%
left_join(select_RNA) %>%
mutate(reported_gender = case_when(reported_gender == "Not Reported" ~ "Unknown", TRUE ~ reported_gender)) %>%
write_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))

89 changes: 89 additions & 0 deletions analyses/DMG_analysis/02-oncoplot.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
title: "oncoplot_DMG"
author: "ZZ"
date: "2024-01-16"
output: pdf_document
---

## load libraries

```{r libraries}
library(tidyverse)
library(ComplexHeatmap)
library(reshape2)
library(circlize)
```


## set directories and read file
```{r}
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
```

oncoplot

```{r}
colname_annotate <- c("match_id", "reported_gender", "tumor_descriptor", "molecular_subtype","EGFR_exp", "EZHIP_exp")
annotate_info <- as.data.frame(onco_df[,colname_annotate])
rownames(annotate_info) <- annotate_info$match_id
annotate_info <- annotate_info[,-1]
ha = HeatmapAnnotation(df = annotate_info, col = list(reported_gender = c("Male" = "#0707CF",
"Female" = "#CC0303",
"Unknown" = "white"),
molecular_subtype = c("DMG, H3 K28" = "red",
"DMG, H3 K28, TP53" = "green")))
mat <- as.matrix(onco_df[,-c(2,3,4,5,14,15)])
mat[is.na(mat)] = ""
rownames(mat) = mat[, 1]
mat = mat[, -1]
mat = t(as.matrix(mat))
alter_fun = list(
background = function(x, y, w, h) {
grid.rect(x, y, w, h, gp = gpar(fill = "#ffffff",col= "#ffffff"))
},
`Likely Gain-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#ff4d4d", col = NA))
},
`Likely Loss-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#0D47A1", col = NA))
},
Unknown = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "grey", col = NA))
},
`Loss-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#80bfff", col = NA))
},
Inconclusive = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#8D6E63", col = NA))
},
`Gain-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#9966ff", col = NA))
},
`WHO mutation` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#E69F00", col = NA))
},
`amplification` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#827717", col = NA))
}
)
oncoplot <- oncoPrint(mat, get_type = function(x)strsplit(x, ";")[[1]],
alter_fun = alter_fun, top_annotation = ha)
pdf(file = "oncoplot.pdf", width = 16, height = 8)
draw(oncoplot)
dev.off()
```


78 changes: 78 additions & 0 deletions analyses/DMG_analysis/03-gsva.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
library(GSVA)

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")


## read histologies
hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))
onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))

## RNA
RNA <- read_rds(file.path(data_dir, "gene-expression-rsem-tpm-collapsed.rds"))
col_rna <- selected_sample %>%
filter(experimental_strategy == "RNA-Seq") %>%
select(Kids_First_Biospecimen_ID, match_id)
select_RNA_all <- RNA[, col_rna %>% pull(Kids_First_Biospecimen_ID)]
log_rna <- as.matrix(log2(select_RNA_all + 1))

## Hallmark gene
human_hallmark <- msigdbr::msigdbr(species = "Homo sapiens",
category = "H")

human_hallmark_twocols <- human_hallmark %>%
select(gs_name, gene_symbol)

human_hallmark_list <- base::split(
human_hallmark_twocols$gene_symbol,
list(human_hallmark_twocols$gs_name))

## annotation
col_annotate <- col_rna %>%
left_join(onco_df %>% select(match_id, EGFR)) %>%
mutate(EGFR_mut = case_when(!is.na(EGFR) ~ "EGFR mutated", TRUE ~ "EGFR unmutated")) %>%
select(-match_id)

RNA_EGFR_un <- as.matrix(log_rna[, col_annotate %>% filter(EGFR_mut == "EGFR unmutated") %>% pull(Kids_First_Biospecimen_ID)])
RNA_EGFR <- as.matrix(log_rna[, col_annotate %>% filter(EGFR_mut == "EGFR mutated") %>% pull(Kids_First_Biospecimen_ID)])

## GSVA for all samples
Hallmark_gsea_score_all <- GSVA::gsva(expr = log_rna,
gset.idx.list = human_hallmark_list,
method = "gsva",
min.sz = 1, max.sz = 1500,
parallel.sz = 8,
mx.diff = TRUE)

colAnn <- ComplexHeatmap::HeatmapAnnotation(EGFR_mut = col_annotete$EGFR_mut, which = "col")
hm <- ComplexHeatmap::Heatmap(Hallmark_gsea_score_all,
show_column_names = FALSE, top_annotation = colAnn,
row_names_gp = grid::gpar(fontsize = 5.8),
name = "GSVA score")

pdf("GSVA.pdf", height = 10, width = 15)
ComplexHeatmap::draw(hm)
dev.off()

## GSVA on EGFR mutated and unmutated
Hallmark_gsea_score <- GSVA::gsva(expr = RNA_EGFR,
gset.idx.list = human_hallmark_list,
method = "gsva",
min.sz = 1, max.sz = 1500,
parallel.sz = 8,
mx.diff = TRUE)

Hallmark_gsea_score_un <- GSVA::gsva(expr = RNA_EGFR_un,
gset.idx.list = human_hallmark_list,
method = "gsva",
min.sz = 1, max.sz = 1500,
parallel.sz = 8,
mx.diff = TRUE)

ComplexHeatmap::Heatmap(Hallmark_gsea_score)
ComplexHeatmap::Heatmap(Hallmark_gsea_score_un)
99 changes: 99 additions & 0 deletions analyses/DMG_analysis/04-summary_table.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
---
title: "summary_DMG"
author: "ZZ"
date: "2024-01-15"
output: pdf_document
---

## libraries
```{r}
library(tidyverse)
```

## set directories
```{r}
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
```

## load files

```{r}
hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))
onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
sv <- data.table::fread(file.path(data_dir, "sv-manta.tsv.gz"))
tp53 <- read_tsv(file.path(root_dir, "analyses", "tp53_nf1_score", "results", "tp53_altered_status.tsv"))
```
### subset files
```{r}
tp53_select <- tp53 %>%
filter(match_id %in% selected_sample$match_id) %>%
select(match_id, tp53_score, tp53_altered) %>%
mutate(tp53_alteration = case_when(tp53_altered %in% c("activated", "loss") ~ "Y", TRUE ~ "N")) %>%
unique()
sv_selected <- sv %>%
filter(Kids.First.Biospecimen.ID.Tumor %in% selected_sample$Kids_First_Biospecimen_ID,
Gene_name == "EGFR") %>%
select(Gene_name, Kids.First.Biospecimen.ID.Tumor, SV_type) %>%
dplyr::rename("Kids_First_Biospecimen_ID" = "Kids.First.Biospecimen.ID.Tumor") %>%
left_join(selected_sample %>% select(Kids_First_Biospecimen_ID, match_id)) %>%
mutate(EGFR_ins = case_when(SV_type == "INS" ~ "Y",
TRUE ~ "N")) %>%
select(match_id, EGFR_ins, SV_type) %>%
unique()
rm(sv)
```

### transform onco file

```{r}
snv_annotate <- read_tsv(file.path(subset_dir, "snv_selected_genes_oncokb.maf.tsv"))
snv_annotate_1 <- snv_annotate %>%
left_join(selected_sample %>% select(match_id, Kids_First_Biospecimen_ID),
by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
mutate(H3.1_K28M = case_when(Hugo_Symbol == "H3-3A" & HGVSp_Short == "p.K28M" ~ "Y", TRUE ~ "N"),
H3.3_K28M = case_when(Hugo_Symbol %in% c("H3C2", "H3C3") & HGVSp_Short == "p.K28M" ~ "Y", TRUE ~ "N"),
EGFR_ins = case_when(grepl("Ins", Variant_Classification) & Hugo_Symbol == "EGFR" ~ "Y", TRUE ~ "N"),
EGFR_onco = case_when(Hugo_Symbol == "EGFR" & ONCOGENIC == "Oncogenic" ~ "Y", TRUE ~ "N"),
EGFR_WHO_reported = case_when(Hugo_Symbol == "EGFR" & HGVSp_Short %in% c("p.A289T", "p.A289V") ~ "Y", TRUE ~ "N"),
ATRX_onco = case_when(Hugo_Symbol == "ATRX" & ONCOGENIC == "Oncogenic" ~ "Y", TRUE ~ "N")) %>%
select(match_id, H3.1_K28M, H3.3_K28M, EGFR_ins, EGFR_onco, EGFR_WHO_reported, ATRX_onco) %>%
filter(rowSums(is.na(.)) != 6) %>%
group_by(match_id) %>%
summarize_at(vars(H3.1_K28M, H3.3_K28M, EGFR_ins, EGFR_onco, EGFR_WHO_reported, ATRX_onco),
funs(sum = paste(unique(.), collapse = ";"))) %>%
unique()
```

```{r}
summary_df <- onco_df %>%
mutate(H3_WT = case_when(is.na(`H3-3A`) & is.na(H3C14) & is.na(H3C2) & is.na(H3C3) ~ "Y", TRUE ~ "N"),
EGFR_overexp = case_when(EGFR_exp == "overexpression" ~ "Y", TRUE ~ "N"),
EZHIP_overexp = case_when(EZHIP_exp == "overexpression" ~ "Y", TRUE ~ "N"),
EGFR_amp = case_when(grepl("amplification", EGFR) ~ "Y", TRUE ~ "N")) %>%
select(match_id, H3_WT, EGFR_overexp, EZHIP_overexp, EGFR_amp) %>%
left_join(tp53_select) %>%
left_join(snv_annotate_1) %>%
left_join(sv_selected) %>%
mutate(H3.1_K28M_sum = case_when(H3.1_K28M_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ H3.1_K28M_sum),
H3.3_K28M_sum = case_when(H3.3_K28M_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ H3.3_K28M_sum),
EGFR_ins_sum = case_when(EGFR_ins_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_ins_sum),
EGFR_onco_sum = case_when(EGFR_onco_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_onco_sum),
EGFR_WHO_reported_sum = case_when(EGFR_WHO_reported_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_WHO_reported_sum))
write_tsv(summary_df, file.path(result_dir, "summary_table.tsv"))
```
Loading

0 comments on commit f312057

Please sign in to comment.