Merge remote-tracking branch 'upstream/master'

grunwaldlab · Apr 17, 2018 · c7d96d2 · c7d96d2
2 parents 8be1379 + 47df2e6
commit c7d96d2
Show file tree

Hide file tree

Showing 38 changed files with 917 additions and 261 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,7 +8,7 @@ Description: Facilitates easy manipulation of variant call format (VCF) data.
     data may be written to a VCF file (*.vcf.gz). It also may be converted into
     other popular R objects (e.g., genlight, DNAbin). VcfR provides a link between
     VCF data and familiar R software.
-Version: 1.7.0
+Version: 1.8.0
 Authors@R: c(
     person(c("Brian", "J."), "Knaus", role = c("cre", "aut"),
     email = "[email protected]", comment = c(ORCID = "0000-0003-1665-4343")),
@@ -41,7 +41,6 @@ Imports:
     stats,
     stringr,
     tibble,
-    tidyr,
     utils,
     vegan,
     viridisLite
@@ -53,7 +52,8 @@ Suggests:
     reshape2,
     rmarkdown,
     scales,
-    testthat
+    testthat,
+    tidyr
 VignetteBuilder: knitr
 License: GPL
 RoxygenNote: 6.0.1
diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,7 @@ export(NM2winNM)
 export(addID)
 export(alleles2consensus)
 export(ann2chromR)
+export(check_keys)
 export(chromR2vcfR)
 export(chromo)
 export(chromoqc)

diff --git a/NEWS.md b/NEWS.md
@@ -6,6 +6,17 @@ At the present, this is simply a to-do list for ideas to include in the next maj
 
 * Move 'FORMAT' column to its own slot. We can then cbind FORMAT and gt when passing to compiled code.
 This may have been addressed at 64a308ba50b9119108e8946737460de5997b805b by adding `samples` to vcfR method `[`.
+* In issue #92 (vcfR2genlight big data #92), JimWhiting91 has documented that `extract.gt()` could be greatly improved with multithreading. While he used `mclapply()` I do not feel this is the best solution because it does not work on Windows. I think a better solution would be [RCppParallel](https://rcppcore.github.io/RcppParallel/) because this should work on all CRAN platforms.
+
+
+# vcfR 1.8.0
+Released on CRAN 2018-04-17
+* Attempted to address CRAN's 'Note: break used in wrong context: no loop is visible' issue.
+* `.vcf_stats_gz()` reports number of elements in header as well as the file's last line. This is used by `read.vcfR()` to check for poorly formed files.
+* `show` method for vcfR now queries @fix instead of @gt.
+* `check_keys()` checks key definitions in the meta section to make sure they are unique.
+* `freq_peak_plot()` has parameter `posUnits` to adjust units of scatterplot.
+* `vcfR2migrate()` manual discusses Unix and Windows line endings.
 
 
 # vcfR 1.7.0
@@ -30,7 +41,7 @@ Released on CRAN 2017-12-08.
 * removed `.Call()` statements to standardize style.
 * Created `vcfR2migrate()` to output MigrateN format data.
 * Addressed clang-UBSAN memory leak in `freq_peak()`.
-* Created `pairwise_genetic_diff()` to calculate pairwise differentiation.
+* Created `pairwise_genetic_diff()` to calculate pairwise differentiation. Thanks Javier!
 
 # vcfR 1.5.0
 Released on CRAN 2017-05-18.

diff --git a/R/check_keys.R b/R/check_keys.R
@@ -0,0 +1,58 @@
+
+
+#### check_keys ####
+#' @rdname check_keys
+#' @aliases check_keys
+#' 
+#' @title Check that INFO and FORMAT keys are unique
+#' 
+#' @param x an oblect of class vcfR
+#' 
+#' @description 
+#' The INFO and FORMAT columns contain information in key-value pairs.
+#' If for some reason a key is not unique it will create issues in retrieving this information.
+#' This function checks the keys defined in the meta section to make sure they are unique.
+#' Note that it does not actually check the INFO and FORMAT columns, just their definitions in the meta section.
+#' This is because each variant can have different information in their INFO and META cells.
+#' Checking these on large files will tehrefore come with a performance cost.
+#' 
+#' @seealso queryMETA()
+#' 
+#' @examples 
+#' data(vcfR_test)
+#' check_keys(vcfR_test)
+#' queryMETA(vcfR_test)
+#' queryMETA(vcfR_test, element = 'DP')
+#' # Note that DP occurs as unique in INFO and FORMAT but they may be different.
+#' 
+#' 
+#' @export
+check_keys <- function(x) {
+  if(class(x) != 'vcfR'){
+    stop( paste('Expecting a vcfR object, instead received:', class(x)) )
+  }
+
+  # First check INFO.
+  myKeys <- grep('INFO', x@meta, value = TRUE)
+  myKeys <- sub('##INFO=<ID=','',myKeys)
+  myKeys <- unlist(lapply(strsplit(myKeys, ','), function(x){x[1]}))
+  myKeys <- table(myKeys)
+  myKeys <- myKeys[myKeys > 1]
+  if( length(myKeys) > 0){
+    warning(paste("The following INFO key occurred more than once:", names(myKeys), '\n'))
+  }
+
+  # Check FORMAT.
+  myKeys <- grep('FORMAT', x@meta, value = TRUE)
+  myKeys <- sub('##FORMAT=<ID=','',myKeys)
+  myKeys <- unlist(lapply(strsplit(myKeys, ','), function(x){x[1]}))
+  myKeys <- table(myKeys)
+  myKeys <- myKeys[myKeys > 1]
+  if( length(myKeys) > 0){
+    warning(paste("The following FORMAT key occurred more than once:", names(myKeys), '\n'))
+  }
+
+}
+
+
+
diff --git a/R/freq_peak_plot.R b/R/freq_peak_plot.R
@@ -7,6 +7,7 @@
 #' Converts allele balance data produced by \code{freq_peak()} to a copy number by assinging the allele balance data (frequencies) to its closest expected ratio.
 #'  
 #' @param pos chromosomal position of variants
+#' @param posUnits units ('bp', 'Kbp', 'Mbp', 'Gbp') for `pos` to be converted to in the main plot
 #' @param ab1 matrix of allele balances for allele 1
 #' @param ab2 matrix of allele balances for allele 2
 #' @param fp1 feq_peak object for allele 1
@@ -26,16 +27,15 @@
 #' The only required information is a vector of chromosomal positions, however this is probably not going to create an interesting plot.
 #' 
 #' 
-#' 
-#' 
-#' 
-#' 
 #' @return An invisible NULL.
 #' 
+#' @seealso 
 #' freq_peak,
 #' peak_to_ploid
 #' 
 #' @examples
+#' 
+#' # An empty plot.
 #' freq_peak_plot(pos=1:40)
 #' 
 #' data(vcfR_example)
@@ -59,6 +59,7 @@
 #' 
 #' @export
 freq_peak_plot <- function(pos, 
+                           posUnits = 'bp',
                            ab1 = NULL, 
                            ab2 = NULL, 
                            fp1 = NULL, 
@@ -81,6 +82,30 @@ freq_peak_plot <- function(pos,
     stop(msg)
   }
 
+  # Handle x-axis units.
+  if( posUnits == 'bp'){
+    # Don't need to do anything.
+  } else if(posUnits == 'Kbp'){
+    pos <- pos/1e3
+    fp1$wins[,'START_pos'] <- fp1$wins[,'START_pos']/1e3
+    fp1$wins[,'END_pos']   <- fp1$wins[,'END_pos']/1e3
+    fp2$wins[,'START_pos'] <- fp2$wins[,'START_pos']/1e3
+    fp2$wins[,'END_pos']   <- fp2$wins[,'END_pos']/1e3
+  } else if(posUnits == 'Mbp'){
+    pos <- pos/1e6
+    fp1$wins[,'START_pos'] <- fp1$wins[,'START_pos']/1e6
+    fp1$wins[,'END_pos']   <- fp1$wins[,'END_pos']/1e6
+    fp2$wins[,'START_pos'] <- fp2$wins[,'START_pos']/1e6
+    fp2$wins[,'END_pos']   <- fp2$wins[,'END_pos']/1e6
+  } else if(posUnits == 'Gbp'){
+    pos <- pos/1e9
+    fp1$wins[,'START_pos'] <- fp1$wins[,'START_pos']/1e9
+    fp1$wins[,'END_pos'] <-   fp1$wins[,'END_pos']/1e9
+    fp2$wins[,'START_pos'] <- fp2$wins[,'START_pos']/1e9
+    fp2$wins[,'END_pos'] <-   fp2$wins[,'END_pos']/1e9
+  }
+  xlabel <- paste('Position (', posUnits, ')', sep = '')  
+
   # Handle color
   col1 <- grDevices::col2rgb(col1, alpha = FALSE)
   col2 <- grDevices::col2rgb(col2, alpha = FALSE)
@@ -101,7 +126,7 @@ freq_peak_plot <- function(pos,
 
   # Initialize plot
   plot( range(pos, na.rm = TRUE), c(0,1), ylim=c(0,1), type="n", yaxt='n', 
-       main = "", xlab = "Position", ylab = "Allele balance")
+       main = "", xlab = xlabel, ylab = "Allele balance")
   graphics::axis(side=2, at=c(0,0.25,0.333,0.5,0.666,0.75,1), 
                  labels=c(0,'1/4','1/3','1/2','2/3','3/4',1), las=1)
   graphics::abline(h=c(0.2,0.25,0.333,0.5,0.666,0.75,0.8), col=8)  

diff --git a/R/genetic_diff.R b/R/genetic_diff.R
@@ -189,6 +189,8 @@ calc_nei <- function(x1, x2){
 #' myPops <- as.factor(rep(c('a','b'), each = 9))
 #' myDiff <- genetic_diff(vcf, myPops, method = "nei")
 #' colMeans(myDiff[,c(3:8,11)], na.rm = TRUE)
+#' hist(myDiff$Gprimest, xlab = expression(italic("G'"["ST"])), 
+#'      col='skyblue', breaks = seq(0, 1, by = 0.01))
 #' 
 #'
 genetic_diff <- function(vcf, pops, method = "nei"){