From 3b5a5364382b10841919b0730a69874973dc437f Mon Sep 17 00:00:00 2001 From: Peter Ruzanov Date: Fri, 13 Mar 2020 13:19:54 -0400 Subject: [PATCH] Adding .R scripts for wdl workflow --- Generate_plody_priors_table.R | 100 ++++++++++++++++++++++++++++++++++ SequenzaPreProcess_v2.2.R | 49 +++++++++++++++++ SequenzaProcess_v2.2.R | 62 +++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 Generate_plody_priors_table.R create mode 100755 SequenzaPreProcess_v2.2.R create mode 100755 SequenzaProcess_v2.2.R diff --git a/Generate_plody_priors_table.R b/Generate_plody_priors_table.R new file mode 100644 index 00000000..0c476d34 --- /dev/null +++ b/Generate_plody_priors_table.R @@ -0,0 +1,100 @@ +### Generate sequenza ploidy priors table based on PANCAN Ascat results +### Script from Jeffrey Bruce, modified by Peter Ruzanov + +library(openxlsx) +cmd_args=commandArgs(trailingOnly = TRUE) +FILE = cmd_args[1] +purity_ploidy<-read.xlsx( + FILE, + sheet = 1, + startRow = 1, + colNames = TRUE, + rowNames = FALSE, + detectDates = FALSE, + skipEmptyRows = TRUE, + skipEmptyCols = TRUE, + rows = NULL, + cols = NULL, + check.names = FALSE, + namedRegion = NULL, + na.strings = "NA", + fillMergedCells = FALSE +) + +cancer_types <- unique(purity_ploidy$tumor_type) + +# LAML Acute Myeloid Leukemia +# ACC Adrenocortical carcinoma +# BLCA Bladder Urothelial Carcinoma +# LGG Brain Lower Grade Glioma +# BRCA Breast invasive carcinoma +# CESC Cervical squamous cell carcinoma and endocervical adenocarcinoma +# CHOL Cholangiocarcinoma +# LCML Chronic Myelogenous Leukemia +# COAD Colon adenocarcinoma +# CNTL Controls +# ESCA Esophageal carcinoma +# FPPP FFPE Pilot Phase II +# GBM Glioblastoma multiforme +# HNSC Head and Neck squamous cell carcinoma +# KICH Kidney Chromophobe +# KIRC Kidney renal clear cell carcinoma +# KIRP Kidney renal papillary cell carcinoma +# LIHC Liver hepatocellular carcinoma +# LUAD Lung adenocarcinoma +# LUSC Lung squamous cell carcinoma +# DLBC Lymphoid Neoplasm Diffuse Large B-cell Lymphoma +# MESO Mesothelioma +# MISC Miscellaneous +# OV Ovarian serous cystadenocarcinoma +# PAAD Pancreatic adenocarcinoma +# PCPG Pheochromocytoma and Paraganglioma +# PRAD Prostate adenocarcinoma +# READ Rectum adenocarcinoma +# SARC Sarcoma +# SKCM Skin Cutaneous Melanoma +# STAD Stomach adenocarcinoma +# TGCT Testicular Germ Cell Tumors +# THYM Thymoma +# THCA Thyroid carcinoma +# UCS Uterine Carcinosarcoma +# UCEC Uterine Corpus Endometrial Carcinoma +# UVM Uveal Melanoma + +ploidy_table<- data.frame(CN = "", + value = "", + cancer_type="")[0,] + +for (cancer in cancer_types[which(cancer_types != "NA")]) { + tab <- purity_ploidy[purity_ploidy$tumor_type==cancer,] + num <- nrow(tab) + + tmp_frame <- data.frame(CN = 0:7, + value = rep(0,8), + cancer_type=rep(cancer,8)) + + for(i in 0:7){ + tmp_frame[i+1,"value"] <- sum(tab$ploidy>(i-0.5) & + tab$ploidy <= (i+0.5),na.rm=TRUE)/num + } + + ploidy_table <- rbind(ploidy_table,tmp_frame) +} + +## add the percentages for all cancers +tmp_frame <- data.frame(CN = 0:7, + value = rep(0,8), + cancer_type=rep("all",8)) + +num <- nrow(purity_ploidy) +for(i in 0:7){ + tmp_frame[i+1,"value"] <- sum(purity_ploidy$ploidy>(i-0.5) & + purity_ploidy$ploidy <= (i+0.5),na.rm=TRUE)/num +} + +ploidy_table <- rbind(ploidy_table,tmp_frame) + + +save(ploidy_table, + file="PANCAN_ASCAT_ploidy_prob.Rdata") + diff --git a/SequenzaPreProcess_v2.2.R b/SequenzaPreProcess_v2.2.R new file mode 100755 index 00000000..8380cbf5 --- /dev/null +++ b/SequenzaPreProcess_v2.2.R @@ -0,0 +1,49 @@ +library(sequenza) +library(optparse) + +## For now, test if commands are in original, trailing format, or new opt-parse format +option_list = list( + make_option(c("-s", "--snp_file"), type="character", default=NULL, + help="varscan snp file name", metavar="character"), + make_option(c("-c", "--cnv_file"), type="character", default=NULL, + help="varscan copy number file name [default= %default]", metavar="character"), + make_option(c("-y", "--remove_Y_vars"), type="logical", default=TRUE, + help="Remove Y chromosome variants? [default= %default]", metavar="logical"), + make_option(c("-p", "--prefix"), type="character", default=NULL, + help="Output prefix for processed data file [default= %default]", metavar="character") + ); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +snp.file <- opt$snp_file +cnv.file <- opt$cnv_file +remove_Y <- opt$remove_Y_vars +prefix <- opt$prefix + +print("Getting SeqZ file for Sequenza:") + +######## +# MAIN # +######## + +## read the snp and cnv files +snp <- read.table(snp.file,sep="\t",header = TRUE) +cnv <- read.table(cnv.file,sep="\t",header = TRUE) + +## remove chromosome M because it seems to cause problems +snp <-subset(snp,chrom!="chrM") +cnv <-subset(cnv,chrom!="chrM") + +## remove chromosome Y because it seems to cause problems +if(remove_Y){ +snp<-subset(snp,chrom!="chrY") +cnv<-subset(cnv,chrom!="chrY") +} + +##### prepare sequenza data file +seqz.data <- VarScan2seqz(varscan.somatic = snp, + varscan.copynumber = cnv) + +file <- paste0(prefix, ".seqz") +write.table(seqz.data, file, col.names = TRUE, row.names = FALSE, sep = "\t") diff --git a/SequenzaProcess_v2.2.R b/SequenzaProcess_v2.2.R new file mode 100755 index 00000000..74bf3fcb --- /dev/null +++ b/SequenzaProcess_v2.2.R @@ -0,0 +1,62 @@ +library(sequenza) +library(optparse) + +## For now, test if commands are in original, trailing format, or new opt-parse format +option_list = list( + make_option(c("-s", "--seqz_file"), type="character", default=NULL, + help="varscan snp file name", metavar="character"), + make_option(c("-l", "--ploidy_file"), type="character", default=NULL, + help="ploidy file name", metavar="character"), + make_option(c("-p", "--prefix"), type="character", default=NULL, + help="Output prefix for result files [default= %default]", metavar="character"), + make_option(c("-g", "--gamma"), type="integer", default=80, + help="Gamma parameter for data extraction [default= %default]", metavar="integer"), + make_option(c("-f", "--female"), type="logical", default=TRUE, + help="Female Sex flag [default= %default]", metavar="logical"), + make_option(c("-w", "--window"), type="integer", default=50, + help="Window for detection of CNVs [default= %default]", metavar="integer"), + make_option(c("-t", "--type"), type="character", default="PCGP", + help="Window for detection of CNVs [default= %default]", metavar="character") + ); + +opt_parser = OptionParser(option_list=option_list); +opt = parse_args(opt_parser); + +FILE <- opt$seqz_file +SAMPLE <- opt$prefix +PLOIDY <- opt$ploidy_file +GAMMA <- opt$gamma +FEMALE <- opt$female +WINDOW <- opt$window +TYPE <- opt$type + +# ======================= PREPROCESSING =================================== +EXTR <- sequenza.extract(FILE, gz = FALSE, + breaks.method = "het", + window = WINDOW, # expose + gamma = GAMMA, # expose + min.reads.normal = 10, + min.reads.baf = 1) + +print("Extract Ok") +ratio_priority = FALSE + +load(file=PLOIDY) # goes to module +priors <- subset(ploidy_table,cancer_type==toupper(TYPE))[c("CN","value")] +CP.example <- sequenza.fit(EXTR, + priors.table = priors, + method = "mufreq", + ratio.priority = ratio_priority, + chromosome.list = 1:22) + +print("Fit Ok") +sequenza.results(EXTR, + cp.table = CP.example, + SAMPLE, + out.dir = getwd(), + female = FEMALE, # expose + CNt.max = 20, + ratio.priority = ratio_priority, + XY = c(X = "chrX", Y = "chrY"), + chromosome.list = 1:22) +print("Results Ok")