From 3b5a5364382b10841919b0730a69874973dc437f Mon Sep 17 00:00:00 2001
From: Peter Ruzanov <pruzanov@gmail.com>
Date: Fri, 13 Mar 2020 13:19:54 -0400
Subject: [PATCH] Adding .R scripts for wdl workflow

---
 Generate_plody_priors_table.R | 100 ++++++++++++++++++++++++++++++++++
 SequenzaPreProcess_v2.2.R     |  49 +++++++++++++++++
 SequenzaProcess_v2.2.R        |  62 +++++++++++++++++++++
 3 files changed, 211 insertions(+)
 create mode 100644 Generate_plody_priors_table.R
 create mode 100755 SequenzaPreProcess_v2.2.R
 create mode 100755 SequenzaProcess_v2.2.R

diff --git a/Generate_plody_priors_table.R b/Generate_plody_priors_table.R
new file mode 100644
index 00000000..0c476d34
--- /dev/null
+++ b/Generate_plody_priors_table.R
@@ -0,0 +1,100 @@
+### Generate sequenza ploidy priors table based on PANCAN Ascat results
+### Script from Jeffrey Bruce, modified by Peter Ruzanov
+
+library(openxlsx)
+cmd_args=commandArgs(trailingOnly = TRUE)
+FILE    = cmd_args[1]
+purity_ploidy<-read.xlsx(
+  FILE,
+  sheet = 1,
+  startRow = 1,
+  colNames = TRUE,
+  rowNames = FALSE,
+  detectDates = FALSE,
+  skipEmptyRows = TRUE,
+  skipEmptyCols = TRUE,
+  rows = NULL,
+  cols = NULL,
+  check.names = FALSE,
+  namedRegion = NULL,
+  na.strings = "NA",
+  fillMergedCells = FALSE
+)
+
+cancer_types <- unique(purity_ploidy$tumor_type)
+
+# LAML 	Acute Myeloid Leukemia
+# ACC 	Adrenocortical carcinoma
+# BLCA 	Bladder Urothelial Carcinoma
+# LGG 	Brain Lower Grade Glioma
+# BRCA 	Breast invasive carcinoma
+# CESC 	Cervical squamous cell carcinoma and endocervical adenocarcinoma
+# CHOL 	Cholangiocarcinoma
+# LCML 	Chronic Myelogenous Leukemia
+# COAD 	Colon adenocarcinoma
+# CNTL 	Controls
+# ESCA 	Esophageal carcinoma
+# FPPP 	FFPE Pilot Phase II
+# GBM 	Glioblastoma multiforme
+# HNSC 	Head and Neck squamous cell carcinoma
+# KICH 	Kidney Chromophobe
+# KIRC 	Kidney renal clear cell carcinoma
+# KIRP 	Kidney renal papillary cell carcinoma
+# LIHC 	Liver hepatocellular carcinoma
+# LUAD 	Lung adenocarcinoma
+# LUSC 	Lung squamous cell carcinoma
+# DLBC 	Lymphoid Neoplasm Diffuse Large B-cell Lymphoma
+# MESO 	Mesothelioma
+# MISC 	Miscellaneous
+# OV 	Ovarian serous cystadenocarcinoma
+# PAAD 	Pancreatic adenocarcinoma
+# PCPG 	Pheochromocytoma and Paraganglioma
+# PRAD 	Prostate adenocarcinoma
+# READ 	Rectum adenocarcinoma
+# SARC 	Sarcoma
+# SKCM 	Skin Cutaneous Melanoma
+# STAD 	Stomach adenocarcinoma
+# TGCT 	Testicular Germ Cell Tumors
+# THYM 	Thymoma
+# THCA 	Thyroid carcinoma
+# UCS 	Uterine Carcinosarcoma
+# UCEC 	Uterine Corpus Endometrial Carcinoma
+# UVM 	Uveal Melanoma
+
+ploidy_table<- data.frame(CN = "", 
+                          value = "",
+                          cancer_type="")[0,]
+
+for (cancer in cancer_types[which(cancer_types != "NA")]) {
+  tab <- purity_ploidy[purity_ploidy$tumor_type==cancer,]
+  num <- nrow(tab)
+  
+  tmp_frame <- data.frame(CN = 0:7, 
+                          value = rep(0,8),
+                          cancer_type=rep(cancer,8))
+  
+  for(i in 0:7){
+    tmp_frame[i+1,"value"] <- sum(tab$ploidy>(i-0.5) & 
+                                  tab$ploidy <= (i+0.5),na.rm=TRUE)/num
+  }
+  
+  ploidy_table <- rbind(ploidy_table,tmp_frame)
+}
+
+## add the percentages for all cancers
+tmp_frame <- data.frame(CN = 0:7, 
+                        value = rep(0,8),
+                        cancer_type=rep("all",8))
+
+num <- nrow(purity_ploidy)
+for(i in 0:7){
+  tmp_frame[i+1,"value"] <- sum(purity_ploidy$ploidy>(i-0.5) & 
+                                purity_ploidy$ploidy <= (i+0.5),na.rm=TRUE)/num
+}
+
+ploidy_table <- rbind(ploidy_table,tmp_frame)
+
+
+save(ploidy_table,
+     file="PANCAN_ASCAT_ploidy_prob.Rdata")
+
diff --git a/SequenzaPreProcess_v2.2.R b/SequenzaPreProcess_v2.2.R
new file mode 100755
index 00000000..8380cbf5
--- /dev/null
+++ b/SequenzaPreProcess_v2.2.R
@@ -0,0 +1,49 @@
+library(sequenza)
+library(optparse)
+
+## For now, test if commands are in original, trailing format, or new opt-parse format
+option_list = list(
+  make_option(c("-s", "--snp_file"), type="character", default=NULL, 
+              help="varscan snp file name", metavar="character"),
+  make_option(c("-c", "--cnv_file"), type="character", default=NULL,
+              help="varscan copy number file name [default= %default]", metavar="character"),
+  make_option(c("-y", "--remove_Y_vars"), type="logical", default=TRUE,
+            help="Remove Y chromosome variants? [default= %default]", metavar="logical"),
+  make_option(c("-p", "--prefix"), type="character", default=NULL,
+            help="Output prefix for processed data file [default= %default]", metavar="character")
+ ); 
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+snp.file <- opt$snp_file
+cnv.file <- opt$cnv_file
+remove_Y <- opt$remove_Y_vars
+prefix   <- opt$prefix
+
+print("Getting SeqZ file for Sequenza:")
+
+########
+# MAIN #
+########
+
+## read the snp and cnv files
+snp <- read.table(snp.file,sep="\t",header = TRUE)
+cnv <- read.table(cnv.file,sep="\t",header = TRUE)
+
+## remove chromosome M because it seems to cause problems
+snp <-subset(snp,chrom!="chrM")
+cnv <-subset(cnv,chrom!="chrM")
+
+## remove chromosome Y because it seems to cause problems
+if(remove_Y){
+snp<-subset(snp,chrom!="chrY")
+cnv<-subset(cnv,chrom!="chrY")
+}
+
+##### prepare sequenza data file
+seqz.data <- VarScan2seqz(varscan.somatic = snp, 
+                          varscan.copynumber = cnv)
+
+file <- paste0(prefix, ".seqz")
+write.table(seqz.data, file, col.names = TRUE, row.names = FALSE, sep = "\t")
diff --git a/SequenzaProcess_v2.2.R b/SequenzaProcess_v2.2.R
new file mode 100755
index 00000000..74bf3fcb
--- /dev/null
+++ b/SequenzaProcess_v2.2.R
@@ -0,0 +1,62 @@
+library(sequenza)
+library(optparse)
+
+## For now, test if commands are in original, trailing format, or new opt-parse format
+option_list = list(
+  make_option(c("-s", "--seqz_file"), type="character", default=NULL,
+              help="varscan snp file name", metavar="character"),
+  make_option(c("-l", "--ploidy_file"), type="character", default=NULL,
+              help="ploidy file name", metavar="character"),
+  make_option(c("-p", "--prefix"), type="character", default=NULL,
+            help="Output prefix for result files [default= %default]", metavar="character"),
+  make_option(c("-g", "--gamma"), type="integer", default=80,
+            help="Gamma parameter for data extraction [default= %default]", metavar="integer"),
+  make_option(c("-f", "--female"), type="logical", default=TRUE,
+            help="Female Sex flag [default= %default]", metavar="logical"),
+  make_option(c("-w", "--window"), type="integer", default=50,
+            help="Window for detection of CNVs [default= %default]", metavar="integer"),
+  make_option(c("-t", "--type"), type="character", default="PCGP",
+            help="Window for detection of CNVs [default= %default]", metavar="character")
+ );
+
+opt_parser = OptionParser(option_list=option_list);
+opt = parse_args(opt_parser);
+
+FILE   <- opt$seqz_file
+SAMPLE <- opt$prefix
+PLOIDY <- opt$ploidy_file
+GAMMA  <- opt$gamma
+FEMALE <- opt$female
+WINDOW <- opt$window
+TYPE   <- opt$type
+
+# ======================= PREPROCESSING ===================================
+EXTR  <- sequenza.extract(FILE, gz = FALSE,
+                          breaks.method = "het",
+                          window = WINDOW,       # expose
+                          gamma  = GAMMA,        # expose
+                          min.reads.normal = 10, 
+                          min.reads.baf = 1)     
+
+print("Extract Ok")
+ratio_priority = FALSE
+
+load(file=PLOIDY) # goes to module
+priors <- subset(ploidy_table,cancer_type==toupper(TYPE))[c("CN","value")]
+CP.example <- sequenza.fit(EXTR,
+                           priors.table = priors,
+                           method = "mufreq",          
+                           ratio.priority = ratio_priority,
+                           chromosome.list = 1:22)
+
+print("Fit Ok")
+sequenza.results(EXTR, 
+                 cp.table = CP.example, 
+                 SAMPLE, 
+                 out.dir = getwd(),
+                 female = FEMALE,     # expose
+                 CNt.max = 20,
+                 ratio.priority = ratio_priority, 
+                 XY = c(X = "chrX", Y = "chrY"),
+                 chromosome.list = 1:22)
+print("Results Ok")