This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Create input files for TCI analysis - V2 #67

Draft · wants to merge 5 commits into base: main

360 changes: 360 additions & 0 deletions analyses/tumor-clone-inference/01-process-data-non-synonymous.Rmd
@@ -0,0 +1,360 @@
---
title: "Inference of subclonal architecture of tumors across multiple timepoints in the paired longitudinal (PL) cohort"
author: 'Antonia Chroni <[email protected]> for D3B'
date: "`r Sys.Date()`"
output:
html_notebook:
toc: TRUE
toc_float: TRUE
---

# Background

This notebook prepares the input files for [FastClone](https://github.com/GuanLab/FastClone_GuanLab).
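
For orientation, the per-patient table this notebook ultimately builds follows a PyClone-style layout. A minimal, made-up example is below; the column names are taken from the `select()`/`rename()` calls later in this notebook, and the IDs and values are fictional.

```{r toy-target-schema, eval=FALSE}
# Illustrative only: one made-up row in the layout assembled by this notebook.
# Column names follow the select()/rename() calls below; IDs and values are fictional.
tibble::tibble(
  mutation_id = "PT_XXXX:Initial CNS Tumor:BS_YYYY:chr1:12345:A:G",
  sample_id = "Initial CNS Tumor:BS_YYYY",
  ref_counts = 40L,
  alt_counts = 10L, # renamed to var_counts in the per-patient FastClone export
  normal_cn = "2",
  major_cn = 2L,
  minor_cn = 1L,
  tumour_content = 0.7
)
```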


# Set up

```{r load-library}
suppressPackageStartupMessages({
library(tidyverse)
library(readxl)
})
```

# Directories and File Inputs/Outputs

```{r set-dir-and-file-names}
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tumor-clone-inference")
input_dir <- file.path(analysis_dir, "input")
data_dir <- file.path(root_dir, "data")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")
maf_files_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
maf_file <- file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz")
tmb_file <- file.path(maf_files_dir, "tmb_vaf_genomic.tsv")
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv") # file from add-sample-distribution module
nautilus_dec_file <- file.path(input_dir, "deceased_samples.xlsx")
palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")
cns_dir <- file.path(input_dir, "cnvkit_data")

# File path to results directory
results_dir <-
file.path(analysis_dir, "results")
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}


# File path to input directory
fastclone_input_dir <-
file.path(analysis_dir, "results", "fastclone-input-non-synonymous")
if (!dir.exists(fastclone_input_dir)) {
dir.create(fastclone_input_dir)
}

```
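
As an aside, a minimal sketch of an equivalent one-step setup (assuming the same `analysis_dir` as above); `recursive = TRUE` also creates missing parent folders, so the separate `results_dir` creation is not strictly required for this path:

```{r create-dirs-recursive, eval=FALSE}
# Equivalent one-step setup: recursive = TRUE creates missing parent folders,
# and showWarnings = FALSE silences the warning if the folder already exists.
fastclone_input_dir <- file.path(analysis_dir, "results", "fastclone-input-non-synonymous")
dir.create(fastclone_input_dir, recursive = TRUE, showWarnings = FALSE)
```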

# Load and process data

```{r load-process-inputs}
# Read in the PBTA histologies file and remove RNA-Seq samples
pbta <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
filter(!(experimental_strategy == "RNA-Seq")) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id,
tumor_fraction, tumor_ploidy)

# Read in the TMB file
tmb_df <- readr::read_tsv(tmb_file, guess_max = 100000, show_col_types = FALSE)

# Read maf
maf_file_df <- vroom::vroom(maf_file, delim = "\t", col_names = TRUE)

# Make a vector of non-synonymous (coding) variant classifications
maf_nonsynonymous <- c(
"Missense_Mutation",
"Frame_Shift_Del",
"In_Frame_Ins",
"Frame_Shift_Ins",
"Splice_Site",
"Nonsense_Mutation",
"In_Frame_Del",
"Nonstop_Mutation",
"Translation_Start_Site"
)

# Process the MAF and keep only non-synonymous mutations for the current analysis
maf_df <- maf_file_df %>%
filter(#Tumor_Sample_Barcode %in% pbta$Kids_First_Biospecimen_ID,
Variant_Classification %in% maf_nonsynonymous
) %>%
dplyr::rename(Kids_First_Biospecimen_ID = Tumor_Sample_Barcode) %>%
left_join(pbta) %>%
left_join(tmb_df) %>%

# Calculate VAF
dplyr::mutate(VAF = t_alt_count / (t_ref_count + t_alt_count),
# Concatenate gene and protein information
gene_protein = paste(Hugo_Symbol, HGVSp_Short, sep = "_")) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id, Chromosome, Variant_Classification,
Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2,
t_ref_count, t_alt_count,
tumor_fraction, tumor_ploidy, VAF, gene_protein, Hugo_Symbol,
mutation_count, tmb) %>%

# Remove hypermutant samples (TMB >= 10)
filter(!tmb >= 10 #,

# Some alterations have a very high number of read counts.
# Excluding those with >= 1000 reads is disabled for now.
# !t_alt_count >= 1000,
#!t_ref_count >= 1000
) %>%

# Add `normal_cn`: total copy number of the segment in healthy tissue.
# For autosomes this will be two; for male sex chromosomes, one.
# Note: only chrY is set to 1 here; chrX is left at 2.
# See: https://github.com/Roth-Lab/pyclone-vi
mutate(normal_cn = case_when(grepl("chrY", Chromosome) ~ "1",
TRUE ~ "2")) %>%

# Approximate CCF (cancer cell fraction) as 2 * VAF,
# which assumes a heterozygous mutation in a diploid region
mutate(CCF = (2*t_alt_count)/ (t_ref_count + t_alt_count)) #%>%
#write_tsv(file.path(scratch_dir, "maf_pbta.tsv"))

print(table(maf_df$Variant_Classification))


# Number of samples in the `maf_df`
maf_samples <- print(length(unique(maf_df$Kids_First_Participant_ID)))

# Read patient list
genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
select(!c(cancer_group, experimental_strategy)) %>%
left_join(maf_df, by = c("Kids_First_Participant_ID")) %>% # create unique identifiers
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":"),
cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
cg_id_kids = str_replace(cg_id_kids, "/|-", "_"),
cg_id_kids = str_replace_all(cg_id_kids, " ", "_"),
cg_id = str_replace(cg_id, "/|-", "_"),
cg_id = str_replace_all(cg_id, " ", "_"),
sample_id = paste(tumor_descriptor, Kids_First_Biospecimen_ID, sep = ":"))

print(table(genomic_paired_df$Variant_Classification))


# Number of samples in the `genomic_paired_df`
genomic_samples <- print(length(unique(genomic_paired_df$Kids_First_Participant_ID)))

# Count the number of biospecimens per patient
# Remove patients with 2 or fewer biospecimens, since phylogenies require at least 3 taxa
kids_specimens_n_df <- genomic_paired_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_specimens_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_specimens_number = n) %>%
filter(!kids_specimens_number <= 2) %>%

left_join(genomic_paired_df, by = c("Kids_First_Participant_ID")) %>%
filter(!is.na(Chromosome))


# Confirm and add the number of timepoints per patient,
# in case any were lost to filtering (e.g., hypermutants, alterations with high read counts)
timepoints_n_df <- kids_specimens_n_df %>%
select(Kids_First_Participant_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_timepoints_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_timepoints_number = n) %>%
filter(!kids_timepoints_number <= 1)

df <- timepoints_n_df %>%
left_join(kids_specimens_n_df, by = c("Kids_First_Participant_ID")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny_non_synonymous.tsv"))

# Number of patients in `df` with >= 3 biospecimens and >= 2 timepoints
samples_with_3bs_n <- print(length(unique(df$Kids_First_Participant_ID)))
samples_with_3bs <- print(unique(df$Kids_First_Participant_ID))

# List of patients eligible for phylogeny,
# with a flag indicating which to use for phylogenetic inference
list_df <- df %>%
select(Kids_First_Participant_ID) %>%
unique() %>%
mutate(somatic_germline_phylogeny = case_when(grepl("PT_KZ56XHJT|PT_KTRJ8TFY", Kids_First_Participant_ID) ~ "yes",
TRUE ~ "not_yet")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny_list_non_synonymous.tsv"))

# Remove large files
rm(genomic_paired_df, kids_specimens_n_df, tmb_df)

```
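
To make the `VAF` and `CCF` columns above concrete, here is a small toy calculation with made-up read counts. Note that `CCF = 2 * VAF` is only a rough approximation: it assumes a heterozygous mutation in a diploid region and does not account for copy number or tumor purity.

```{r vaf-ccf-toy, eval=FALSE}
# Toy illustration of the VAF and CCF calculations used above (made-up counts).
tibble::tibble(
  t_ref_count = c(60, 45, 90),
  t_alt_count = c(20, 45, 10)
) %>%
  dplyr::mutate(
    VAF = t_alt_count / (t_ref_count + t_alt_count),
    CCF = 2 * t_alt_count / (t_ref_count + t_alt_count)
  )
```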

## Add Nautilus location for Deceased specimens

```{r add-Nautilus-location}
nautilus_dec_df <- read_excel(nautilus_dec_file) %>%
right_join(df, by = c("sample_id", "aliquot_id", "tumor_descriptor")) %>%
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":")) %>%
write_tsv(file.path(results_dir, "nautilus_dec_non_synonymous.tsv"))

list_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, `Note field from Nautilus of initial parent`) %>%
unique() %>%
write_tsv(file.path(results_dir, "nautilus_dec_list_non_synonymous.tsv"))

# Make a list of patients with 2 or more biospecimens at the Deceased timepoint
dec_n_df <- df %>%
filter(tumor_descriptor == "Deceased") %>%
select(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
# dplyr::mutate(kids_dec_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_deceased_bs_number = n) %>%
filter(!kids_deceased_bs_number <= 1) %>%
write_tsv(file.path(results_dir, "kids_dec_multiple_bs_list_non_synonymous.tsv"))
```


# Create input files

We need to generate the input files according to the method's template.
Phylogenetic methods require at least two samples per tumor site (multiregional sampling per anatomical site).
Here, we consider patients with at least two timepoints and one or more biospecimens per timepoint.
We will later compare samples with single versus multiple biospecimens.
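
The eligibility criteria above reduce to two per-patient counts. A minimal sketch of that logic on made-up IDs, using the same column names as above:

```{r eligibility-toy, eval=FALSE}
# Toy sketch of the eligibility filters applied earlier (made-up patient/biospecimen IDs):
# keep patients with at least 3 biospecimens and at least 2 distinct timepoints.
toy_specimens <- tibble::tribble(
  ~Kids_First_Participant_ID, ~Kids_First_Biospecimen_ID, ~tumor_descriptor,
  "PT_A", "BS_1", "Initial CNS Tumor",
  "PT_A", "BS_2", "Progressive",
  "PT_A", "BS_3", "Deceased",
  "PT_B", "BS_4", "Initial CNS Tumor",
  "PT_B", "BS_5", "Initial CNS Tumor"
)

toy_specimens %>%
  dplyr::group_by(Kids_First_Participant_ID) %>%
  dplyr::summarise(
    n_biospecimens = dplyr::n_distinct(Kids_First_Biospecimen_ID),
    n_timepoints = dplyr::n_distinct(tumor_descriptor)
  ) %>%
  dplyr::filter(n_biospecimens >= 3, n_timepoints >= 2) # only PT_A passes here
```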

```{r create-all-samples-df}
# Create df for all samples
all_samples_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id,
cg_id_kids, mutation_id, sample_id,
tumor_descriptor, Chromosome, Start_Position, End_Position,
Reference_Allele, Tumor_Seq_Allele2, t_ref_count, t_alt_count,
normal_cn, tumor_fraction, mutation_count, Hugo_Symbol) %>%

# change names to match input requirements
dplyr::rename("ref_counts" = "t_ref_count",
"alt_counts" = "t_alt_count",
"tumour_content" = "tumor_fraction")

all_samples_number <- print(unique(all_samples_df$Kids_First_Participant_ID))
```

## Add major_cn, minor_cn columns from cns files

```{r process-cns-files}
# List all CNVkit .call.cns files
# Note: this reuses the `data_dir` name defined above, overwriting the path with the file list
data_dir <- dir(path = cns_dir, pattern = ".call.cns", full.names = TRUE, recursive = TRUE)

# Create list
data_list <- list()

for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)

for (x in seq_along(sample_name) ) {
data_list[[i]] <- read.csv(data_dir[i], header=T, sep="\t")

# Create file_name
file_name <- gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,12])
print(file_name)

data_list[[i]] <- data_list[[i]] %>%
mutate(Kids_First_Biospecimen_ID = file_name)

# The following code assigns a name to the df,
# but it also adds an extra df to the list; the cause is unclear and it is not needed, so it stays disabled
#df <- assign(file_name, data_list)
#data_list[[file_name]] <- df

}
}

# Bind all df from list
data_list_bind <- dplyr::bind_rows(data_list) %>%
as.data.frame()

head(data_list_bind)

# Remove large files
rm(data_list)

# Create the PyClone-style `input` table (saved per patient below)
input <- data_list_bind %>%

# Rename columns to match the input format
# TODO: confirm that CNVkit's cn1/cn2 correspond to major/minor copy number (see the sanity check after this chunk)
dplyr::rename("major_cn" = "cn1",
"minor_cn" = "cn2",
#"Start_Position" = "start",
#"End_Position" = "end",
"Chromosome" = "chromosome"#,
#"Hugo_Symbol" = "gene"
) %>%
#select(-c(Kids_First_Biospecimen_ID)) %>%

right_join(all_samples_df) %>%

filter(!is.na(major_cn),
!is.na(minor_cn) #,
#!is.na(Kids_First_Participant_ID)
) %>%
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":")) %>%
select(Kids_First_Participant_ID, tumor_descriptor, cg_id_kids, mutation_id, sample_id, ref_counts,
alt_counts, normal_cn, major_cn, minor_cn, tumour_content, mutation_count, Hugo_Symbol,
) %>%

# To ensure there are no duplicated entries in the dataframe
unique()

# Save df with all data together
# We will need this to prepare the input file for ClonEvol
input_all <- nautilus_dec_df %>%
select(mutation_id, VAF, CCF, gene_protein, Hugo_Symbol) %>%
right_join(input) %>%
write_tsv(file.path(results_dir, "input_all_data_non_synonymous.tsv"))


samples_number <- print(unique(input$Kids_First_Participant_ID))
```
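
The rename above assumes CNVkit's `cn1`/`cn2` are the major and minor allele-specific copy numbers, respectively. A quick optional sanity check, under that assumption, is to confirm that no segment ends up with `major_cn < minor_cn`:

```{r check-cn-assignment, eval=FALSE}
# Sanity check for the cn1 -> major_cn / cn2 -> minor_cn assumption:
# if cn1 is the larger allele-specific copy number, no row should have major_cn < minor_cn.
input %>%
  dplyr::summarise(
    n_segments = dplyr::n(),
    n_major_lt_minor = sum(major_cn < minor_cn, na.rm = TRUE)
  )
```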

# Save input files

```{r save-input-files}
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)

for (x in seq_along(sample_name) ) {

## Save input file for FastClone
fastclone_fname <- paste0(fastclone_input_dir, "/", sample_name[x], ".tsv")
print(fastclone_fname)

fastclone_input_subset <- input %>%
dplyr::rename("var_counts" = "alt_counts") %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
write_tsv(file.path(fastclone_fname))

}
}
```
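
As an optional check, one of the written per-patient files can be read back to confirm it contains the expected columns. The column names here come from this notebook's `select()`/`rename()` calls, not from FastClone's documentation.

```{r check-fastclone-files, eval=FALSE}
# Read one written file back and confirm the columns produced above are present.
written_files <- list.files(fastclone_input_dir, pattern = "\\.tsv$", full.names = TRUE)

if (length(written_files) > 0) {
  check_df <- readr::read_tsv(written_files[1], show_col_types = FALSE)
  expected_cols <- c("mutation_id", "sample_id", "ref_counts", "var_counts",
                     "normal_cn", "major_cn", "minor_cn", "tumour_content")
  setdiff(expected_cols, colnames(check_df)) # character(0) means all expected columns are present
}
```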

```{r}
sessionInfo()
```

3,337 changes: 3,337 additions & 0 deletions analyses/tumor-clone-inference/01-process-data-non-synonymous.nb.html

Large diffs are not rendered by default.
