Skip to content

Commit

Permalink
Adding .R scripts for wdl workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
pruzanov committed Mar 13, 2020
1 parent 987110f commit 3b5a536
Show file tree
Hide file tree
Showing 3 changed files with 211 additions and 0 deletions.
100 changes: 100 additions & 0 deletions Generate_plody_priors_table.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
### Generate sequenza ploidy priors table based on PANCAN Ascat results
### Script from Jeffrey Bruce, modified by Peter Ruzanov

library(openxlsx)
cmd_args=commandArgs(trailingOnly = TRUE)
FILE = cmd_args[1]
purity_ploidy<-read.xlsx(
FILE,
sheet = 1,
startRow = 1,
colNames = TRUE,
rowNames = FALSE,
detectDates = FALSE,
skipEmptyRows = TRUE,
skipEmptyCols = TRUE,
rows = NULL,
cols = NULL,
check.names = FALSE,
namedRegion = NULL,
na.strings = "NA",
fillMergedCells = FALSE
)

cancer_types <- unique(purity_ploidy$tumor_type)

# LAML Acute Myeloid Leukemia
# ACC Adrenocortical carcinoma
# BLCA Bladder Urothelial Carcinoma
# LGG Brain Lower Grade Glioma
# BRCA Breast invasive carcinoma
# CESC Cervical squamous cell carcinoma and endocervical adenocarcinoma
# CHOL Cholangiocarcinoma
# LCML Chronic Myelogenous Leukemia
# COAD Colon adenocarcinoma
# CNTL Controls
# ESCA Esophageal carcinoma
# FPPP FFPE Pilot Phase II
# GBM Glioblastoma multiforme
# HNSC Head and Neck squamous cell carcinoma
# KICH Kidney Chromophobe
# KIRC Kidney renal clear cell carcinoma
# KIRP Kidney renal papillary cell carcinoma
# LIHC Liver hepatocellular carcinoma
# LUAD Lung adenocarcinoma
# LUSC Lung squamous cell carcinoma
# DLBC Lymphoid Neoplasm Diffuse Large B-cell Lymphoma
# MESO Mesothelioma
# MISC Miscellaneous
# OV Ovarian serous cystadenocarcinoma
# PAAD Pancreatic adenocarcinoma
# PCPG Pheochromocytoma and Paraganglioma
# PRAD Prostate adenocarcinoma
# READ Rectum adenocarcinoma
# SARC Sarcoma
# SKCM Skin Cutaneous Melanoma
# STAD Stomach adenocarcinoma
# TGCT Testicular Germ Cell Tumors
# THYM Thymoma
# THCA Thyroid carcinoma
# UCS Uterine Carcinosarcoma
# UCEC Uterine Corpus Endometrial Carcinoma
# UVM Uveal Melanoma

ploidy_table<- data.frame(CN = "",
value = "",
cancer_type="")[0,]

for (cancer in cancer_types[which(cancer_types != "NA")]) {
tab <- purity_ploidy[purity_ploidy$tumor_type==cancer,]
num <- nrow(tab)

tmp_frame <- data.frame(CN = 0:7,
value = rep(0,8),
cancer_type=rep(cancer,8))

for(i in 0:7){
tmp_frame[i+1,"value"] <- sum(tab$ploidy>(i-0.5) &
tab$ploidy <= (i+0.5),na.rm=TRUE)/num
}

ploidy_table <- rbind(ploidy_table,tmp_frame)
}

## add the percentages for all cancers
tmp_frame <- data.frame(CN = 0:7,
value = rep(0,8),
cancer_type=rep("all",8))

num <- nrow(purity_ploidy)
for(i in 0:7){
tmp_frame[i+1,"value"] <- sum(purity_ploidy$ploidy>(i-0.5) &
purity_ploidy$ploidy <= (i+0.5),na.rm=TRUE)/num
}

ploidy_table <- rbind(ploidy_table,tmp_frame)


save(ploidy_table,
file="PANCAN_ASCAT_ploidy_prob.Rdata")

49 changes: 49 additions & 0 deletions SequenzaPreProcess_v2.2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
library(sequenza)
library(optparse)

## For now, test if commands are in original, trailing format, or new opt-parse format
option_list = list(
make_option(c("-s", "--snp_file"), type="character", default=NULL,
help="varscan snp file name", metavar="character"),
make_option(c("-c", "--cnv_file"), type="character", default=NULL,
help="varscan copy number file name [default= %default]", metavar="character"),
make_option(c("-y", "--remove_Y_vars"), type="logical", default=TRUE,
help="Remove Y chromosome variants? [default= %default]", metavar="logical"),
make_option(c("-p", "--prefix"), type="character", default=NULL,
help="Output prefix for processed data file [default= %default]", metavar="character")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

snp.file <- opt$snp_file
cnv.file <- opt$cnv_file
remove_Y <- opt$remove_Y_vars
prefix <- opt$prefix

print("Getting SeqZ file for Sequenza:")

########
# MAIN #
########

## read the snp and cnv files
snp <- read.table(snp.file,sep="\t",header = TRUE)
cnv <- read.table(cnv.file,sep="\t",header = TRUE)

## remove chromosome M because it seems to cause problems
snp <-subset(snp,chrom!="chrM")
cnv <-subset(cnv,chrom!="chrM")

## remove chromosome Y because it seems to cause problems
if(remove_Y){
snp<-subset(snp,chrom!="chrY")
cnv<-subset(cnv,chrom!="chrY")
}

##### prepare sequenza data file
seqz.data <- VarScan2seqz(varscan.somatic = snp,
varscan.copynumber = cnv)

file <- paste0(prefix, ".seqz")
write.table(seqz.data, file, col.names = TRUE, row.names = FALSE, sep = "\t")
62 changes: 62 additions & 0 deletions SequenzaProcess_v2.2.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
library(sequenza)
library(optparse)

## For now, test if commands are in original, trailing format, or new opt-parse format
option_list = list(
make_option(c("-s", "--seqz_file"), type="character", default=NULL,
help="varscan snp file name", metavar="character"),
make_option(c("-l", "--ploidy_file"), type="character", default=NULL,
help="ploidy file name", metavar="character"),
make_option(c("-p", "--prefix"), type="character", default=NULL,
help="Output prefix for result files [default= %default]", metavar="character"),
make_option(c("-g", "--gamma"), type="integer", default=80,
help="Gamma parameter for data extraction [default= %default]", metavar="integer"),
make_option(c("-f", "--female"), type="logical", default=TRUE,
help="Female Sex flag [default= %default]", metavar="logical"),
make_option(c("-w", "--window"), type="integer", default=50,
help="Window for detection of CNVs [default= %default]", metavar="integer"),
make_option(c("-t", "--type"), type="character", default="PCGP",
help="Window for detection of CNVs [default= %default]", metavar="character")
);

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

FILE <- opt$seqz_file
SAMPLE <- opt$prefix
PLOIDY <- opt$ploidy_file
GAMMA <- opt$gamma
FEMALE <- opt$female
WINDOW <- opt$window
TYPE <- opt$type

# ======================= PREPROCESSING ===================================
EXTR <- sequenza.extract(FILE, gz = FALSE,
breaks.method = "het",
window = WINDOW, # expose
gamma = GAMMA, # expose
min.reads.normal = 10,
min.reads.baf = 1)

print("Extract Ok")
ratio_priority = FALSE

load(file=PLOIDY) # goes to module
priors <- subset(ploidy_table,cancer_type==toupper(TYPE))[c("CN","value")]
CP.example <- sequenza.fit(EXTR,
priors.table = priors,
method = "mufreq",
ratio.priority = ratio_priority,
chromosome.list = 1:22)

print("Fit Ok")
sequenza.results(EXTR,
cp.table = CP.example,
SAMPLE,
out.dir = getwd(),
female = FEMALE, # expose
CNt.max = 20,
ratio.priority = ratio_priority,
XY = c(X = "chrX", Y = "chrY"),
chromosome.list = 1:22)
print("Results Ok")

0 comments on commit 3b5a536

Please sign in to comment.