EGFR in DMG

d3b-center · Jan 16, 2024 · f312057 · f312057
1 parent dfe6d9f
commit f312057
Show file tree

Hide file tree

Showing 11 changed files with 4,635 additions and 0 deletions.
diff --git a/analyses/DMG_analysis/00-EGFR_maf_subset.R b/analyses/DMG_analysis/00-EGFR_maf_subset.R
@@ -0,0 +1,38 @@
+## subset maf file
+library(tidyverse)
+
+## set directories
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+data_dir <- file.path(root_dir, "data", "v13")
+subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
+result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
+
+hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
+selected_sample <- hist %>% 
+  filter(grepl(c("DMG"), molecular_subtype))
+
+## subset genes of interests
+gene_of_interests <- c("H3-3A", "H3C2", "H3C3", "H3C14", 
+                       "EGFR", "TP53", "ATRX", "NF1")
+
+tumor_only_maf <- data.table::fread(
+  file.path(data_dir, "snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz"),
+  data.table = FALSE) %>% 
+  filter(Tumor_Sample_Barcode %in% selected_sample$Kids_First_Biospecimen_ID, 
+         Hugo_Symbol %in% gene_of_interests)
+
+# Add tumor only to consensus MAF
+snv_consensus_hotspot_maf <- data.table::fread(
+  file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz"),
+  data.table = FALSE) %>%
+  filter(Tumor_Sample_Barcode %in% selected_sample$Kids_First_Biospecimen_ID, 
+         Hugo_Symbol %in% gene_of_interests) 
+
+tumor_only_maf$PUBMED <- as.character(tumor_only_maf$PUBMED)
+tumor_only_maf$PHENO <- as.character(tumor_only_maf$PHENO)
+tumor_only_maf$gnomad_3_1_1_AF_non_cancer <- as.character(tumor_only_maf$gnomad_3_1_1_AF_non_cancer)
+
+snv_final <- snv_consensus_hotspot_maf %>%  
+  dplyr::bind_rows(tumor_only_maf) 
+
+write_tsv(snv_final, file.path(subset_dir, "snv_selected_genes.maf.tsv"))
diff --git a/analyses/DMG_analysis/00-onco_annotate.sh b/analyses/DMG_analysis/00-onco_annotate.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+set -e
+set -o pipefail
+
+python_path=(/usr/bin/python3)
+maf_annotator=(/Users/gengz/Documents/GitHub/oncokb-annotator/MafAnnotator.py)
+maf_in=(/Users/gengz/Documents/GitHub/OpenPedCan-analysis/analyses/DMG_analysis/subset/snv_selected_genes.maf.tsv)
+maf_oncokb_out=(/Users/gengz/Documents/GitHub/OpenPedCan-analysis/analyses/DMG_analysis/subset/snv_selected_genes_oncokb.maf.tsv)
+
+# Run maf_annotator on dgd samples
+$python_path $maf_annotator -i $maf_in -o $maf_oncokb_out -b $ONCO_KB -q hgvsp_short -r GRCh38
diff --git a/analyses/DMG_analysis/01-data preparation.R b/analyses/DMG_analysis/01-data preparation.R
@@ -0,0 +1,74 @@
+library(tidyverse)
+library(reshape2)
+
+## set directories
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+data_dir <- file.path(root_dir, "data", "v13")
+subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
+result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
+
+
+## read histologies
+hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
+selected_sample <- hist %>% 
+  filter(grepl(c("DMG"), molecular_subtype))
+
+##snv file
+snv_annotate <- read_tsv(file.path(subset_dir, "snv_selected_genes_oncokb.maf.tsv"))
+snv_annotate_short <- snv_annotate %>% 
+  mutate(MUTATION_EFFECT = case_when(Hugo_Symbol == "EGFR" & 
+                                       HGVSp_Short %in% c("p.A289V", "p.A289T") ~ 
+                                       paste(MUTATION_EFFECT, "WHO mutation", sep = ";"), 
+                                     TRUE ~ MUTATION_EFFECT)) %>%
+  select(Tumor_Sample_Barcode, MUTATION_EFFECT, Hugo_Symbol)
+
+## cn files
+cn_df <- read_tsv(file.path(data_dir, "consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz"))
+cn_filtered <- cn_df %>% 
+  filter(biospecimen_id %in% selected_sample$Kids_First_Biospecimen_ID,
+         gene_symbol == "EGFR") %>% 
+  select(biospecimen_id, status, gene_symbol) %>%
+  dplyr::rename("Tumor_Sample_Barcode" = "biospecimen_id", 
+                "MUTATION_EFFECT" = "status", 
+                "Hugo_Symbol" = "gene_symbol") %>%
+  mutate(MUTATION_EFFECT = case_when(MUTATION_EFFECT == "amplification" ~ "amplification", 
+                                     TRUE ~ "")) 
+rm(cn_df)
+
+## combined cnv with snv
+snv_onco <- cn_filtered %>%
+  bind_rows(snv_annotate_short) %>%
+  left_join(hist %>% select(Kids_First_Biospecimen_ID, match_id), 
+            by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
+  select(match_id, MUTATION_EFFECT, Hugo_Symbol) %>%
+  unique() %>%
+  group_by(match_id, Hugo_Symbol) %>% 
+  summarise(status = paste0(MUTATION_EFFECT, collapse = ";")) %>% 
+  spread(Hugo_Symbol, status)
+
+## RNA files
+RNA <- read_rds(file.path(data_dir, "gene-expression-rsem-tpm-collapsed.rds"))
+col_rna <- selected_sample %>% filter(experimental_strategy == "RNA-Seq") %>% select(Kids_First_Biospecimen_ID, match_id)
+
+select_RNA <- RNA[rownames(RNA) %in% c("EZHIP", "EGFR"), col_rna %>% pull(Kids_First_Biospecimen_ID)]
+select_RNA <- as.data.frame(scale(t(select_RNA)))
+select_RNA$Kids_First_Biospecimen_ID <- rownames(select_RNA)
+
+select_RNA <- select_RNA %>% 
+  mutate(EGFR_exp = case_when(EGFR >=3 ~ "overexpression", 
+                              TRUE ~ NA), 
+         EZHIP_exp = case_when(EZHIP >=3 ~ "overexpression", 
+                               TRUE ~ NA)) %>% 
+  left_join(selected_sample %>% select(Kids_First_Biospecimen_ID, match_id)) %>% 
+  select(match_id, EGFR_exp, EZHIP_exp) %>% 
+  unique()
+
+## gather all datas into one table
+final_df <- selected_sample %>% 
+  select(match_id, age_at_diagnosis_days, reported_gender, molecular_subtype, tumor_descriptor) %>%
+  unique() %>%
+  left_join(snv_onco) %>% 
+  left_join(select_RNA) %>%
+  mutate(reported_gender = case_when(reported_gender == "Not Reported" ~ "Unknown", TRUE ~ reported_gender)) %>% 
+  write_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
+
diff --git a/analyses/DMG_analysis/02-oncoplot.Rmd b/analyses/DMG_analysis/02-oncoplot.Rmd
@@ -0,0 +1,89 @@
+---
+title: "oncoplot_DMG"
+author: "ZZ"
+date: "2024-01-16"
+output: pdf_document
+---
+
+## load libraries
+
+```{r libraries}
+library(tidyverse)
+library(ComplexHeatmap)
+library(reshape2)
+library(circlize)
+```
+
+
+## set directories and read file
+```{r}
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
+result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
+
+onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
+```
+
+oncoplot
+
+```{r}
+
+colname_annotate <- c("match_id", "reported_gender", "tumor_descriptor", "molecular_subtype","EGFR_exp", "EZHIP_exp")
+
+annotate_info <- as.data.frame(onco_df[,colname_annotate])
+rownames(annotate_info) <- annotate_info$match_id
+annotate_info <- annotate_info[,-1]
+
+ha = HeatmapAnnotation(df = annotate_info, col = list(reported_gender = c("Male" = "#0707CF",
+                                                      "Female" = "#CC0303",
+                                                      "Unknown" = "white"),
+                                                      molecular_subtype = c("DMG, H3 K28" = "red", 
+                                                                            "DMG, H3 K28, TP53" = "green")))
+
+mat <- as.matrix(onco_df[,-c(2,3,4,5,14,15)])
+mat[is.na(mat)] = ""
+rownames(mat) = mat[, 1]
+mat = mat[, -1]
+mat = t(as.matrix(mat))
+
+alter_fun = list(
+  background = function(x, y, w, h) {
+    grid.rect(x, y, w, h, gp = gpar(fill = "#ffffff",col= "#ffffff"))
+  },
+  `Likely Gain-of-function` = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#ff4d4d", col = NA))
+  },
+  `Likely Loss-of-function` = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#0D47A1", col = NA))
+  },
+  Unknown = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "grey", col = NA))
+  },
+  `Loss-of-function` = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#80bfff", col = NA))
+  },
+  Inconclusive = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#8D6E63", col = NA))
+  },
+  `Gain-of-function` = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#9966ff", col = NA))
+  },
+  `WHO mutation` = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#E69F00", col = NA))
+  },
+  `amplification` = function(x, y, w, h) {
+    grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#827717", col = NA))
+  }
+)
+
+oncoplot <- oncoPrint(mat, get_type = function(x)strsplit(x, ";")[[1]], 
+          alter_fun = alter_fun, top_annotation = ha)
+
+pdf(file = "oncoplot.pdf", width = 16, height = 8) 
+draw(oncoplot)
+dev.off()
+
+
+```
+
+
diff --git a/analyses/DMG_analysis/03-gsva.R b/analyses/DMG_analysis/03-gsva.R
@@ -0,0 +1,78 @@
+library(GSVA)
+
+## set directories
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+data_dir <- file.path(root_dir, "data", "v13")
+subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
+result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
+
+
+## read histologies
+hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
+selected_sample <- hist %>% 
+  filter(grepl(c("DMG"), molecular_subtype))
+onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
+
+## RNA
+RNA <- read_rds(file.path(data_dir, "gene-expression-rsem-tpm-collapsed.rds"))
+col_rna <- selected_sample %>% 
+  filter(experimental_strategy == "RNA-Seq") %>% 
+  select(Kids_First_Biospecimen_ID, match_id)
+select_RNA_all <- RNA[, col_rna %>% pull(Kids_First_Biospecimen_ID)]
+log_rna <- as.matrix(log2(select_RNA_all + 1))
+
+## Hallmark gene
+human_hallmark <- msigdbr::msigdbr(species = "Homo sapiens", 
+                                   category = "H")
+
+human_hallmark_twocols <- human_hallmark %>% 
+  select(gs_name, gene_symbol)
+
+human_hallmark_list <- base::split(
+  human_hallmark_twocols$gene_symbol,
+  list(human_hallmark_twocols$gs_name))
+
+## annotation 
+col_annotate <- col_rna %>% 
+  left_join(onco_df %>% select(match_id, EGFR)) %>% 
+  mutate(EGFR_mut = case_when(!is.na(EGFR) ~ "EGFR mutated", TRUE ~ "EGFR unmutated")) %>% 
+  select(-match_id)
+
+RNA_EGFR_un <- as.matrix(log_rna[, col_annotate %>% filter(EGFR_mut == "EGFR unmutated") %>% pull(Kids_First_Biospecimen_ID)])
+RNA_EGFR <- as.matrix(log_rna[, col_annotate %>% filter(EGFR_mut == "EGFR mutated") %>% pull(Kids_First_Biospecimen_ID)])
+
+## GSVA for all samples
+Hallmark_gsea_score_all <- GSVA::gsva(expr = log_rna,
+                                      gset.idx.list = human_hallmark_list,
+                                      method = "gsva", 
+                                      min.sz = 1, max.sz = 1500,
+                                      parallel.sz = 8, 
+                                      mx.diff = TRUE) 
+
+colAnn <- ComplexHeatmap::HeatmapAnnotation(EGFR_mut = col_annotete$EGFR_mut, which = "col")
+hm <- ComplexHeatmap::Heatmap(Hallmark_gsea_score_all, 
+                              show_column_names = FALSE, top_annotation = colAnn, 
+                              row_names_gp = grid::gpar(fontsize = 5.8),
+                              name = "GSVA score")
+
+pdf("GSVA.pdf", height = 10, width = 15)
+ComplexHeatmap::draw(hm)
+dev.off()
+
+## GSVA on EGFR mutated and unmutated
+Hallmark_gsea_score <- GSVA::gsva(expr = RNA_EGFR,
+                                  gset.idx.list = human_hallmark_list,
+                                  method = "gsva", 
+                                  min.sz = 1, max.sz = 1500,
+                                  parallel.sz = 8, 
+                                  mx.diff = TRUE) 
+
+Hallmark_gsea_score_un <- GSVA::gsva(expr = RNA_EGFR_un,
+                                     gset.idx.list = human_hallmark_list,
+                                     method = "gsva", 
+                                     min.sz = 1, max.sz = 1500,
+                                     parallel.sz = 8, 
+                                     mx.diff = TRUE) 
+
+ComplexHeatmap::Heatmap(Hallmark_gsea_score)
+ComplexHeatmap::Heatmap(Hallmark_gsea_score_un)
diff --git a/analyses/DMG_analysis/04-summary_table.Rmd b/analyses/DMG_analysis/04-summary_table.Rmd
@@ -0,0 +1,99 @@
+---
+title: "summary_DMG"
+author: "ZZ"
+date: "2024-01-15"
+output: pdf_document
+---
+
+## libraries
+```{r}
+library(tidyverse)
+
+```
+
+## set directories 
+```{r}
+root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
+data_dir <- file.path(root_dir, "data", "v13")
+subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
+result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")
+
+```
+
+## load files
+
+```{r}
+hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
+selected_sample <- hist %>% 
+  filter(grepl(c("DMG"), molecular_subtype))
+onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
+sv <- data.table::fread(file.path(data_dir, "sv-manta.tsv.gz"))
+tp53 <- read_tsv(file.path(root_dir, "analyses", "tp53_nf1_score", "results", "tp53_altered_status.tsv"))
+
+```
+### subset files
+```{r}
+tp53_select <- tp53 %>% 
+  filter(match_id %in% selected_sample$match_id) %>% 
+  select(match_id, tp53_score, tp53_altered) %>% 
+  mutate(tp53_alteration = case_when(tp53_altered %in% c("activated", "loss") ~ "Y", TRUE ~ "N")) %>% 
+  unique()
+
+sv_selected <- sv %>% 
+  filter(Kids.First.Biospecimen.ID.Tumor %in% selected_sample$Kids_First_Biospecimen_ID, 
+         Gene_name == "EGFR") %>% 
+  select(Gene_name, Kids.First.Biospecimen.ID.Tumor, SV_type) %>% 
+  dplyr::rename("Kids_First_Biospecimen_ID" = "Kids.First.Biospecimen.ID.Tumor") %>% 
+  left_join(selected_sample %>% select(Kids_First_Biospecimen_ID, match_id)) %>% 
+  mutate(EGFR_ins = case_when(SV_type == "INS" ~ "Y", 
+                              TRUE ~ "N")) %>% 
+  select(match_id, EGFR_ins, SV_type) %>% 
+  unique()
+
+rm(sv)
+```
+
+### transform onco file
+
+```{r}
+
+snv_annotate <- read_tsv(file.path(subset_dir, "snv_selected_genes_oncokb.maf.tsv"))
+snv_annotate_1 <- snv_annotate %>% 
+  left_join(selected_sample %>% select(match_id, Kids_First_Biospecimen_ID), 
+            by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
+  mutate(H3.1_K28M = case_when(Hugo_Symbol == "H3-3A" & HGVSp_Short == "p.K28M" ~ "Y", TRUE ~ "N"), 
+         H3.3_K28M = case_when(Hugo_Symbol %in% c("H3C2", "H3C3") & HGVSp_Short == "p.K28M" ~ "Y", TRUE ~ "N"),
+         EGFR_ins = case_when(grepl("Ins", Variant_Classification) & Hugo_Symbol == "EGFR" ~ "Y", TRUE ~ "N"), 
+         EGFR_onco = case_when(Hugo_Symbol == "EGFR" & ONCOGENIC == "Oncogenic" ~ "Y", TRUE ~ "N"),
+         EGFR_WHO_reported = case_when(Hugo_Symbol == "EGFR" & HGVSp_Short %in% c("p.A289T", "p.A289V") ~ "Y", TRUE ~ "N"),
+         ATRX_onco = case_when(Hugo_Symbol == "ATRX" & ONCOGENIC == "Oncogenic" ~ "Y", TRUE ~ "N")) %>% 
+  select(match_id, H3.1_K28M, H3.3_K28M, EGFR_ins, EGFR_onco, EGFR_WHO_reported, ATRX_onco) %>% 
+  filter(rowSums(is.na(.)) != 6) %>%
+  group_by(match_id) %>% 
+  summarize_at(vars(H3.1_K28M, H3.3_K28M, EGFR_ins, EGFR_onco, EGFR_WHO_reported, ATRX_onco), 
+               funs(sum = paste(unique(.), collapse = ";"))) %>%
+  unique() 
+  
+
+
+```
+
+```{r}
+
+summary_df <- onco_df %>% 
+  mutate(H3_WT = case_when(is.na(`H3-3A`) & is.na(H3C14) & is.na(H3C2) & is.na(H3C3) ~ "Y", TRUE ~ "N"), 
+         EGFR_overexp = case_when(EGFR_exp == "overexpression" ~ "Y", TRUE ~ "N"), 
+         EZHIP_overexp = case_when(EZHIP_exp == "overexpression" ~ "Y", TRUE ~ "N"), 
+         EGFR_amp = case_when(grepl("amplification", EGFR) ~ "Y", TRUE ~ "N")) %>% 
+  select(match_id, H3_WT, EGFR_overexp, EZHIP_overexp, EGFR_amp) %>%
+  left_join(tp53_select) %>% 
+  left_join(snv_annotate_1) %>% 
+  left_join(sv_selected) %>% 
+  mutate(H3.1_K28M_sum = case_when(H3.1_K28M_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ H3.1_K28M_sum), 
+         H3.3_K28M_sum = case_when(H3.3_K28M_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ H3.3_K28M_sum), 
+         EGFR_ins_sum = case_when(EGFR_ins_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_ins_sum),
+         EGFR_onco_sum = case_when(EGFR_onco_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_onco_sum),
+         EGFR_WHO_reported_sum = case_when(EGFR_WHO_reported_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_WHO_reported_sum))
+
+write_tsv(summary_df, file.path(result_dir, "summary_table.tsv"))
+```