-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsubset-gds.wdl
99 lines (88 loc) · 2.66 KB
/
subset-gds.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
version 1.0
# [1] subsetGDSchr -- subsets a GDS file based on defined input variants
task subsetGDSchr {
input {
File gds_file
File variants_file
# runtime attributes
Int addldisk = 1
Int cpu = 4
Int memory = 8
Int preempt = 3
}
command <<<
echo "Writing R script to local file"
cat > subsetgdsfile.R << EOF
args = commandArgs(trailingOnly=TRUE)
gds_filename <- args[1]
variants_filename <- args[2]
# Install SeqArray if needed
if("SeqArray" %in% rownames(installed.packages()) == FALSE) {
if (!requireNamespace("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("SeqArray")
}
# Load libraries
library(SeqArray)
# Read in files
system(paste("cp", gds_filename, "."))
variants_file <- read.csv(variants_filename, header = T)
gds_file <- seqOpen(basename(gds_filename), readonly = F)
# Get variants that need to be extracted
colnames(variants_file) <- tolower(colnames(variants_file))
colnames(variants_file) <- gsub("x...", "", colnames(variants_file))
if("id" %in% colnames(variants_file)) {
snps_include <- variants_file[,"id"]
} else {
snps_include <- paste(variants_file[,"chr"],variants_file[,"pos"],variants_file[,"ref"], variants_file[,"alt"], sep = "_")
}
# Get chromosome, position, alleles from GDS
chromosome <- seqGetData(gds_file, "chromosome")
position <- seqGetData(gds_file, "position")
allele <- seqGetData(gds_file, "allele")
# Split alleles variable to be ref and alt
ref <- sub(",.*$", "", allele)
alt <- sub("^.*,", "", allele)
# Add variant ID column
seqAddValue(gds_file, "variant.id", paste(chromosome, position, ref, alt, sep = "_"), replace = T)
# Filter to the variants
seqSetFilter(gds_file, variant.id=snps_include)
# Export file name
subfile <- paste(sub("\\.gds", "", basename(gds_filename)), "subset", "gds", sep = ".")
seqExport(gds_file, subfile)
EOF
echo "Calling R script subsetgdsfile.R"
Rscript subsetgdsfile.R ~{gds_file} ~{variants_file}
>>>
# Estimate disk size required
Int gds_size = ceil(size(gds_file, "GB"))
Int finalDiskSize = 4*gds_size + addldisk
runtime {
cpu: cpu
docker: "uwgac/topmed-master@sha256:0bb7f98d6b9182d4e4a6b82c98c04a244d766707875ddfd8a48005a9f5c5481e"
disks: "local-disk " + finalDiskSize + " HDD"
memory: "${memory} GB"
preemptibles: "${preempt}"
}
output {
File gds_output = glob("*.gds")[1]
}
}
workflow subsetGDS {
input {
File gds_file
File variants_file
}
call subsetGDSchr {
input:
gds_file = gds_file,
variants_file = variants_file
}
output {
File gds_output = subsetGDSchr.gds_output
}
meta {
author: "Sarah Hsu"
email: "[email protected]"
}
}