-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathanalyze_vcf.R
133 lines (94 loc) · 5.82 KB
/
analyze_vcf.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# Script to visualize aspects of data in VCF files.
# Dependencies: 'variantannotation' package in 'bioconductor'
# Written by Jamie Walters (jrw47 [at] stanford __dot__ edu)
# Run as:
#% Rscript analyze_vcf.R myvariants.vcf output_file
#Version 2 is amended to deal with missing data which crops up when running on large data files.
args <- commandArgs(TRUE)
library(VariantAnnotation)
#setwd("~/Documents/PostDoc_Research/MK-tests_Hmelpomene/vcf_R_scripts")
#file = "~/Documents/PostDoc_Research/MK-tests_Hmelpomene/vcf_R_scripts/test_amaagl.vcf"
#basename = "out"
file <- args[1]
basename <- args[2]
vcf <- readVcf(file, "Hmel")
hdr <- exptData(vcf)[["header"]]
####################################
# Plot distribution of QUAL score #
####################################
quals <- vcf@fixed$QUAL
mean.qual <- mean(quals)
median.qual <- median(quals)
sd.qual <- sd(quals)
bin <-100
png(paste(basename,"_QualHist.png", sep=""), height=5,width=5,units="in", res=200)
hist(quals, breaks=seq(0,max(quals)+bin,bin), col="dark green", border="dark green", xlim=c(0, mean.qual+5*sd.qual),
main="Distribution of SNP Quality Scores", xlab="SNP Quality Score", cex.axis=0.8)
dev.off()
###############################
# Plot distribution of depth #
###############################
depth <- vcf@info$DP
depth.mean <- mean(depth)
depth.median <- median(depth)
depth.sd <- sd(depth)
bin <-5
png(paste(basename,"_DepthHist.png", sep=""), height=5,width=5,units="in", res=200)
hist(depth, breaks=seq(0,max(depth)+bin,bin), col="dark green", border="dark green", xlim=c(0, depth.mean+5*depth.sd),
main="Distribution of Total Depth per SNP", xlab="Summed Coverage per SNP", cex.axis=0.8)
dev.off()
#col2rgb("black")
png(paste(basename,"_QualByDepth.png", sep=""), height=5,width=5,units="in", res=200)
plot (quals ~ depth, xlim=c(0, depth.mean+5*depth.sd), ylim=c(0, mean.qual+5*sd.qual), pch=19, cex=.3, col=rgb(0,0,0,.05),
xlab="Summed Coverage per SNP", ylab="SNP Quality Score", main="Coverage Depth ~ SNP Quality", cex.axis=0.8 )
dev.off()
##################################
# Count & Plot numbers of empty #
# genotypes per sample and site #
##################################
count.empty.genos <- function (x) {
length(grep('./.',x, fixed=T))
}
empty.genos.sample <- apply(geno(vcf)$GT, 2, count.empty.genos ) # apply to rows
empty.genos.sites <- apply(geno(vcf)$GT, 1, count.empty.genos ) # apply to cols
#plot frequency of sites missing N samples
png(paste(basename,"_EmptyGeno-Sites.png", sep=""), height=5,width=5,units="in", res=200)
barplot(table(empty.genos.sites[empty.genos.sites > 0]), main = "Missing genotypes per site", ylab="Number of sites", xlab="Count of missing genotypes", sub=paste(dim(vcf)[1], "sites counted"), cex.lab=1.2, col="light blue" )
dev.off()
#Plot counts sites missing data per sample
png(paste(basename,"_EmptyGeno-Samples.png", sep=""), height=5,width=5,units="in", res=200)
par(oma=c(4,1,0.5,1))
barplot(empty.genos.sample, las=2, main = "Missing genotypes per sample", ylab="Number of sites", cex.lab=1.2, col="light blue", cex.names=.7, )
mtext( paste(dim(vcf)[1], "sites counted"), outer=T, padj=4, 1)
dev.off()
write.table(table(empty.genos.sites),file=paste(basename,"_EmptyGeno-Sites.txt", sep=""), quote=F, sep="\t", row.names=F )
write.table(empty.genos.sample, file=paste(basename,"_EmptyGeno-Samples.txt", sep=""), quote=F, sep="\t", row.names=T, col.names=F)
##################################
#Plot allele frequencies & counts#
##################################
af <- unlist(vcf@info$AF) # get allele frequncies
ac <- unlist(vcf@info$AC) # get allele counts
#Allele freqs
af.table.all <- table(af)
af.table.complete <- table(af[empty.genos.sites == 0])
png(paste(basename,"_SFS_completeGenos.png", sep=""), height=5,width=5,units="in", res=200)
barplot(af.table.complete/sum(af.table.complete), main="Allele Frequency Spectrum", ylab= "Proportion of SNPs", xlab="Allele Frequency", col="light blue", las=2, cex.lab=1.2, cex.names=.5, sub=paste(sum(af.table.complete), "sites with complete genotypes counted") )
dev.off()
out.af.complete <- cbind(names(af.table.complete),af.table.complete, af.table.complete/sum(af.table.complete)) # create object for writing to file
out.af.all <- cbind(names(af.table.all),af.table.all, af.table.all/sum(af.table.all)) # create object for writing to file
write.table(out.af.complete, file=paste(basename,"_SFS_completeGenos.txt", sep=""), quote=F, sep="\t",
row.names=F, col.names=c("Allele_Frequency","Count","Proportion"))
write.table(out.af.all, file=paste(basename,"_SFS_allsites.txt", sep=""), quote=F, sep="\t",
row.names=F, col.names=c("Allele_Frequency","Count","Proportion"))
#Allele counts
ac.table.all <- table(ac)
ac.table.complete <- table(ac[empty.genos.sites == 0])
png(paste(basename,"_AlleleCountSpectrum.png", sep=""), height=5,width=5,units="in", res=200)
barplot(ac.table.complete/sum(ac.table.complete), main="Allele Frequency Spectrum", ylab= "Proportion of SNPs", xlab="Allele Count", col="light blue", las=1, cex.lab=1.2, cex.names=.9, sub=paste(sum(ac.table.complete), "sites with complete genotypes counted") )
dev.off()
out.ac.complete <- cbind(names(ac.table.complete),ac.table.complete, ac.table.complete/sum(ac.table.complete)) # create object for writing to file
out.ac.all <- cbind(names(ac.table.all),ac.table.all, ac.table.all/sum(ac.table.all)) # create object for writing to file
write.table(out.ac.complete, file=paste(basename,"_AlleleCounts_completeGenos.txt", sep=""), quote=F, sep="\t",
row.names=F, col.names=c("Allele_Count","Site_Count","Proportion"))
write.table(out.af.all, file=paste(basename,"_AlleleCounts_allsites.txt", sep=""), quote=F, sep="\t",
row.names=F, col.names=c("Allele_Count","Site_Count","Proportion"))