This repository has been archived by the owner on Oct 2, 2024. It is now read-only.

Create input files for TCI analysis - V2 #67

Draft · wants to merge 5 commits into base: main

360 changes: 360 additions & 0 deletions analyses/tumor-clone-inference/01-process-data-non-synonymous.Rmd
@@ -0,0 +1,360 @@
---
title: "Inference of subclonal architecture of tumors across multiple timepoints in the paired longitudinal (PL) cohort"
author: 'Antonia Chroni <[email protected]> for D3B'
date: "`r Sys.Date()`"
output:
html_notebook:
toc: TRUE
toc_float: TRUE
---

# Background

This notebook prepares the input files for [FastClone](https://github.com/GuanLab/FastClone_GuanLab).
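
For orientation, the per-patient table this notebook ultimately builds follows a PyClone-style layout. A minimal, made-up example is below; the column names are taken from the `select()`/`rename()` calls later in this notebook, and the IDs and values are fictional.

```{r toy-target-schema, eval=FALSE}
# Illustrative only: one made-up row in the layout assembled by this notebook.
# Column names follow the select()/rename() calls below; IDs and values are fictional.
tibble::tibble(
  mutation_id = "PT_XXXX:Initial CNS Tumor:BS_YYYY:chr1:12345:A:G",
  sample_id = "Initial CNS Tumor:BS_YYYY",
  ref_counts = 40L,
  alt_counts = 10L, # renamed to var_counts in the per-patient FastClone export
  normal_cn = "2",
  major_cn = 2L,
  minor_cn = 1L,
  tumour_content = 0.7
)
```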


# Set up

```{r load-library}
suppressPackageStartupMessages({
library(tidyverse)
library(readxl)
})
```

# Directories and File Inputs/Outputs

```{r set-dir-and-file-names}
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tumor-clone-inference")
input_dir <- file.path(analysis_dir, "input")
data_dir <- file.path(root_dir, "data")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")
maf_files_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
maf_file <- file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz")
tmb_file <- file.path(maf_files_dir, "tmb_vaf_genomic.tsv")
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv") # file from add-sample-distribution module
nautilus_dec_file <- file.path(input_dir, "deceased_samples.xlsx")
palette_file <- file.path(root_dir, "figures", "palettes", "tumor_descriptor_color_palette.tsv")
cns_dir <- file.path(input_dir, "cnvkit_data")

# File path to results directory
results_dir <-
file.path(analysis_dir, "results")
if (!dir.exists(results_dir)) {
dir.create(results_dir)
}


# File path to input directory
fastclone_input_dir <-
file.path(analysis_dir, "results", "fastclone-input-non-synonymous")
if (!dir.exists(fastclone_input_dir)) {
dir.create(fastclone_input_dir)
}

```
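
As an aside, a minimal sketch of an equivalent one-step setup (assuming the same `analysis_dir` as above); `recursive = TRUE` also creates missing parent folders, so the separate `results_dir` creation is not strictly required for this path:

```{r create-dirs-recursive, eval=FALSE}
# Equivalent one-step setup: recursive = TRUE creates missing parent folders,
# and showWarnings = FALSE silences the warning if the folder already exists.
fastclone_input_dir <- file.path(analysis_dir, "results", "fastclone-input-non-synonymous")
dir.create(fastclone_input_dir, recursive = TRUE, showWarnings = FALSE)
```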

# Load and process data

```{r load-process-inputs}
# Read in the PBTA histologies file and remove RNA-Seq samples
pbta <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>%
filter(!(experimental_strategy == "RNA-Seq")) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id,
tumor_fraction, tumor_ploidy)

# Read in the TMB file
tmb_df <- readr::read_tsv(tmb_file, guess_max = 100000, show_col_types = FALSE)

# Read maf
maf_file_df <- vroom::vroom(maf_file, delim = "\t", col_names = TRUE)

# Make a vector of non-synonymous (coding) variant classifications
maf_nonsynonymous <- c(
"Missense_Mutation",
"Frame_Shift_Del",
"In_Frame_Ins",
"Frame_Shift_Ins",
"Splice_Site",
"Nonsense_Mutation",
"In_Frame_Del",
"Nonstop_Mutation",
"Translation_Start_Site"
)

# Process the MAF and keep only non-synonymous mutations for the current analysis
maf_df <- maf_file_df %>%
filter(#Tumor_Sample_Barcode %in% pbta$Kids_First_Biospecimen_ID,
Variant_Classification %in% maf_nonsynonymous
) %>%
dplyr::rename(Kids_First_Biospecimen_ID = Tumor_Sample_Barcode) %>%
left_join(pbta) %>%
left_join(tmb_df) %>%

# Calculate VAF
dplyr::mutate(VAF = t_alt_count / (t_ref_count + t_alt_count),
# Concatenate gene and protein information
gene_protein = paste(Hugo_Symbol, HGVSp_Short, sep = "_")) %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id, tumor_descriptor,
sample_id, aliquot_id, Chromosome, Variant_Classification,
Start_Position, End_Position, Reference_Allele, Tumor_Seq_Allele1, Tumor_Seq_Allele2,
t_ref_count, t_alt_count,
tumor_fraction, tumor_ploidy, VAF, gene_protein, Hugo_Symbol,
mutation_count, tmb) %>%

# Remove hypermutant samples (TMB >= 10)
filter(!tmb >= 10 #,

# Some alterations have a very high number of read counts.
# Excluding those with >= 1000 reads is disabled for now.
# !t_alt_count >= 1000,
#!t_ref_count >= 1000
) %>%

# Add `normal_cn`: total copy number of the segment in healthy tissue.
# For autosomes this will be two; for male sex chromosomes, one.
# Note: only chrY is set to 1 here; chrX is left at 2.
# See: https://github.com/Roth-Lab/pyclone-vi
mutate(normal_cn = case_when(grepl("chrY", Chromosome) ~ "1",
TRUE ~ "2")) %>%

# Approximate CCF (cancer cell fraction) as 2 * VAF,
# which assumes a heterozygous mutation in a diploid region
mutate(CCF = (2*t_alt_count)/ (t_ref_count + t_alt_count)) #%>%
#write_tsv(file.path(scratch_dir, "maf_pbta.tsv"))

print(table(maf_df$Variant_Classification))


# Number of samples in the `maf_df`
maf_samples <- print(length(unique(maf_df$Kids_First_Participant_ID)))

# Read patient list
genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
select(!c(cancer_group, experimental_strategy)) %>%
left_join(maf_df, by = c("Kids_First_Participant_ID")) %>% # create unique identifiers
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":"),
cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
cg_id_kids = str_replace(cg_id_kids, "/|-", "_"),
cg_id_kids = str_replace_all(cg_id_kids, " ", "_"),
cg_id = str_replace(cg_id, "/|-", "_"),
cg_id = str_replace_all(cg_id, " ", "_"),
sample_id = paste(tumor_descriptor, Kids_First_Biospecimen_ID, sep = ":"))

print(table(genomic_paired_df$Variant_Classification))


# Number of samples in the `genomic_paired_df`
genomic_samples <- print(length(unique(genomic_paired_df$Kids_First_Participant_ID)))

# Count the number of biospecimens per patient
# Remove patients with 2 or fewer biospecimens, since phylogenies require at least 3 taxa
kids_specimens_n_df <- genomic_paired_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_specimens_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_specimens_number = n) %>%
filter(!kids_specimens_number <= 2) %>%

left_join(genomic_paired_df, by = c("Kids_First_Participant_ID")) %>%
filter(!is.na(Chromosome))


# Confirm and add the number of timepoints per patient,
# in case any were lost to filtering (e.g., hypermutants, alterations with high read counts)
timepoints_n_df <- kids_specimens_n_df %>%
select(Kids_First_Participant_ID, tumor_descriptor) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
dplyr::mutate(kids_timepoints_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_timepoints_number = n) %>%
filter(!kids_timepoints_number <= 1)

df <- timepoints_n_df %>%
left_join(kids_specimens_n_df, by = c("Kids_First_Participant_ID")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny_non_synonymous.tsv"))

# Number of patients in `df` with >= 3 biospecimens and >= 2 timepoints
samples_with_3bs_n <- print(length(unique(df$Kids_First_Participant_ID)))
samples_with_3bs <- print(unique(df$Kids_First_Participant_ID))

# List of patients eligible for phylogeny,
# with a flag indicating which to use for phylogenetic inference
list_df <- df %>%
select(Kids_First_Participant_ID) %>%
unique() %>%
mutate(somatic_germline_phylogeny = case_when(grepl("PT_KZ56XHJT|PT_KTRJ8TFY", Kids_First_Participant_ID) ~ "yes",
TRUE ~ "not_yet")) %>%
write_tsv(file.path(results_dir, "samples_eligible_for_phylogeny_list_non_synonymous.tsv"))

# Remove large files
rm(genomic_paired_df, kids_specimens_n_df, tmb_df)

```
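
To make the `VAF` and `CCF` columns above concrete, here is a small toy calculation with made-up read counts. Note that `CCF = 2 * VAF` is only a rough approximation: it assumes a heterozygous mutation in a diploid region and does not account for copy number or tumor purity.

```{r vaf-ccf-toy, eval=FALSE}
# Toy illustration of the VAF and CCF calculations used above (made-up counts).
tibble::tibble(
  t_ref_count = c(60, 45, 90),
  t_alt_count = c(20, 45, 10)
) %>%
  dplyr::mutate(
    VAF = t_alt_count / (t_ref_count + t_alt_count),
    CCF = 2 * t_alt_count / (t_ref_count + t_alt_count)
  )
```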

## Add Nautilus location for Deceased specimens

```{r add-Nautilus-location}
nautilus_dec_df <- read_excel(nautilus_dec_file) %>%
right_join(df, by = c("sample_id", "aliquot_id", "tumor_descriptor")) %>%
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":")) %>%
write_tsv(file.path(results_dir, "nautilus_dec_non_synonymous.tsv"))

list_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, `Note field from Nautilus of initial parent`) %>%
unique() %>%
write_tsv(file.path(results_dir, "nautilus_dec_list_non_synonymous.tsv"))

# Make a list of patients with 2 or more biospecimens at the Deceased timepoint
dec_n_df <- df %>%
filter(tumor_descriptor == "Deceased") %>%
select(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID) %>%
unique() %>%
dplyr::count(Kids_First_Participant_ID) %>%
# dplyr::mutate(kids_dec_n = glue::glue("{Kids_First_Participant_ID} (N={n})")) %>%
dplyr::rename(kids_deceased_bs_number = n) %>%
filter(!kids_deceased_bs_number <= 1) %>%
write_tsv(file.path(results_dir, "kids_dec_multiple_bs_list_non_synonymous.tsv"))
```


# Create input files

We need to generate the input files according to the method's template.
Phylogenetic methods require at least two samples per tumor site (multiregional sampling per anatomical site).
Here, we consider patients with at least two timepoints and one or more biospecimens per timepoint.
We will later compare samples with single versus multiple biospecimens.
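
The eligibility criteria above reduce to two per-patient counts. A minimal sketch of that logic on made-up IDs, using the same column names as above:

```{r eligibility-toy, eval=FALSE}
# Toy sketch of the eligibility filters applied earlier (made-up patient/biospecimen IDs):
# keep patients with at least 3 biospecimens and at least 2 distinct timepoints.
toy_specimens <- tibble::tribble(
  ~Kids_First_Participant_ID, ~Kids_First_Biospecimen_ID, ~tumor_descriptor,
  "PT_A", "BS_1", "Initial CNS Tumor",
  "PT_A", "BS_2", "Progressive",
  "PT_A", "BS_3", "Deceased",
  "PT_B", "BS_4", "Initial CNS Tumor",
  "PT_B", "BS_5", "Initial CNS Tumor"
)

toy_specimens %>%
  dplyr::group_by(Kids_First_Participant_ID) %>%
  dplyr::summarise(
    n_biospecimens = dplyr::n_distinct(Kids_First_Biospecimen_ID),
    n_timepoints = dplyr::n_distinct(tumor_descriptor)
  ) %>%
  dplyr::filter(n_biospecimens >= 3, n_timepoints >= 2) # only PT_A passes here
```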

```{r create-all-samples-df}
# Create df for all samples
all_samples_df <- nautilus_dec_df %>%
select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_id,
cg_id_kids, mutation_id, sample_id,
tumor_descriptor, Chromosome, Start_Position, End_Position,
Reference_Allele, Tumor_Seq_Allele2, t_ref_count, t_alt_count,
normal_cn, tumor_fraction, mutation_count, Hugo_Symbol) %>%

# change names to match input requirements
dplyr::rename("ref_counts" = "t_ref_count",
"alt_counts" = "t_alt_count",
"tumour_content" = "tumor_fraction")

all_samples_number <- print(unique(all_samples_df$Kids_First_Participant_ID))
```

## Add major_cn, minor_cn columns from cns files

```{r process-cns-files}
# List all CNVkit .call.cns files
# Note: this reuses the `data_dir` name defined above, overwriting the path with the file list
data_dir <- dir(path = cns_dir, pattern = ".call.cns", full.names = TRUE, recursive = TRUE)

# Create list
data_list <- list()

for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)

for (x in seq_along(sample_name) ) {
data_list[[i]] <- read.csv(data_dir[i], header=T, sep="\t")

# Create file_name
file_name <- gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,12])
print(file_name)

data_list[[i]] <- data_list[[i]] %>%
mutate(Kids_First_Biospecimen_ID = file_name)

# The following code assigns a name to the df,
# but it also adds an extra df to the list; the cause is unclear and it is not needed, so it stays disabled
#df <- assign(file_name, data_list)
#data_list[[file_name]] <- df

}
}

# Bind all df from list
data_list_bind <- dplyr::bind_rows(data_list) %>%
as.data.frame()

head(data_list_bind)

# Remove large files
rm(data_list)

# Create the PyClone-style `input` table (saved per patient below)
input <- data_list_bind %>%

# Rename columns to match the input format
# TODO: confirm that CNVkit's cn1/cn2 correspond to major/minor copy number (see the sanity check after this chunk)
dplyr::rename("major_cn" = "cn1",
"minor_cn" = "cn2",
#"Start_Position" = "start",
#"End_Position" = "end",
"Chromosome" = "chromosome"#,
#"Hugo_Symbol" = "gene"
) %>%
#select(-c(Kids_First_Biospecimen_ID)) %>%

right_join(all_samples_df) %>%

filter(!is.na(major_cn),
!is.na(minor_cn) #,
#!is.na(Kids_First_Participant_ID)
) %>%
dplyr::mutate(mutation_id = paste(Kids_First_Participant_ID, tumor_descriptor, Kids_First_Biospecimen_ID, Chromosome, Start_Position, Reference_Allele, Tumor_Seq_Allele2, sep = ":")) %>%
select(Kids_First_Participant_ID, tumor_descriptor, cg_id_kids, mutation_id, sample_id, ref_counts,
alt_counts, normal_cn, major_cn, minor_cn, tumour_content, mutation_count, Hugo_Symbol,
) %>%

# To ensure there are no duplicated entries in the dataframe
unique()

# Save df with all data together
# We will need this to prepare the input file for ClonEvol
input_all <- nautilus_dec_df %>%
select(mutation_id, VAF, CCF, gene_protein, Hugo_Symbol) %>%
right_join(input) %>%
write_tsv(file.path(results_dir, "input_all_data_non_synonymous.tsv"))


samples_number <- print(unique(input$Kids_First_Participant_ID))
```
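
The rename above assumes CNVkit's `cn1`/`cn2` are the major and minor allele-specific copy numbers, respectively. A quick optional sanity check, under that assumption, is to confirm that no segment ends up with `major_cn < minor_cn`:

```{r check-cn-assignment, eval=FALSE}
# Sanity check for the cn1 -> major_cn / cn2 -> minor_cn assumption:
# if cn1 is the larger allele-specific copy number, no row should have major_cn < minor_cn.
input %>%
  dplyr::summarise(
    n_segments = dplyr::n(),
    n_major_lt_minor = sum(major_cn < minor_cn, na.rm = TRUE)
  )
```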

# Save input files

```{r save-input-files}
for (i in 1:length(data_dir) ) {
# Create sample_name
sample_name <- unique(as.character(gsub(".call.cns", "", str_split_fixed(data_dir[i], "/", 13)[,11])))
sample_name <- sort(sample_name, decreasing = FALSE)
print(sample_name)

for (x in seq_along(sample_name) ) {

## Save input file for FastClone
fastclone_fname <- paste0(fastclone_input_dir, "/", sample_name[x], ".tsv")
print(fastclone_fname)

fastclone_input_subset <- input %>%
dplyr::rename("var_counts" = "alt_counts") %>%
filter(Kids_First_Participant_ID == sample_name[x]) %>%
write_tsv(file.path(fastclone_fname))

}
}
```
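
As an optional check, one of the written per-patient files can be read back to confirm it contains the expected columns. The column names here come from this notebook's `select()`/`rename()` calls, not from FastClone's documentation.

```{r check-fastclone-files, eval=FALSE}
# Read one written file back and confirm the columns produced above are present.
written_files <- list.files(fastclone_input_dir, pattern = "\\.tsv$", full.names = TRUE)

if (length(written_files) > 0) {
  check_df <- readr::read_tsv(written_files[1], show_col_types = FALSE)
  expected_cols <- c("mutation_id", "sample_id", "ref_counts", "var_counts",
                     "normal_cn", "major_cn", "minor_cn", "tumour_content")
  setdiff(expected_cols, colnames(check_df)) # character(0) means all expected columns are present
}
```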

```{r}
sessionInfo()
```

3,337 changes: 3,337 additions & 0 deletions analyses/tumor-clone-inference/01-process-data-non-synonymous.nb.html

Large diffs are not rendered by default.
