diff --git a/DESCRIPTION b/DESCRIPTION
index a8da22e..ca90bd8 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -24,6 +24,7 @@ Imports:
ggplot2,
ggrepel,
ggvenn,
+ glue,
impute,
limma,
mice,
@@ -32,13 +33,13 @@ Imports:
pheatmap,
psych,
readxl,
+ scales,
seqinr,
softImpute,
tidyr (>= 1.3.0),
tibble,
tictoc,
- utils,
- visdat
+ utils
Suggests:
roxyglobals,
testthat (>= 3.0.0)
diff --git a/NAMESPACE b/NAMESPACE
index ac43cde..a549c57 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -39,6 +39,7 @@ importFrom(Rdpack,reprompt)
importFrom(UpSetR,fromList)
importFrom(UpSetR,upset)
importFrom(data.table,fread)
+importFrom(glue,glue)
importFrom(impute,impute.knn)
importFrom(limma,normalizeQuantiles)
importFrom(mice,complete)
@@ -48,6 +49,7 @@ importFrom(pcaMethods,completeObs)
importFrom(pcaMethods,pca)
importFrom(psych,describeBy)
importFrom(readxl,read_excel)
+importFrom(scales,percent)
importFrom(seqinr,getName)
importFrom(seqinr,read.fasta)
importFrom(seqinr,write.fasta)
@@ -69,4 +71,3 @@ importFrom(tibble,column_to_rownames)
importFrom(tibble,rownames_to_column)
importFrom(utils,read.csv)
importFrom(utils,write.csv)
-importFrom(visdat,vis_miss)
diff --git a/R/dataMissing.R b/R/dataMissing.R
index 95a1475..fd8d5f7 100644
--- a/R/dataMissing.R
+++ b/R/dataMissing.R
@@ -27,20 +27,33 @@
#'
#' @param dataSet The 2d data set of experimental values.
#'
+#' @param sort_miss A boolean (default = FALSE) specifying whether to arrange the columns
+#' in decreasing order of missingness (columns with the most missing values first).
+#'
#' @param plot A boolean (default = FALSE) specifying whether to plot the missingness.
#'
+#' @param show_pct_legend A boolean (default = TRUE) specifying whether the percentages of
+#' missing and present values in the entire dataset are shown in the legend of the
+#' visualization when \code{plot = TRUE}.
+#'
#' @param show_labels A boolean (default = TRUE) specifying whether protein names are
#' shown in the visualization when \code{plot = TRUE}.
#'
+#' @param show_pct_col A boolean (default = TRUE) specifying whether the percentage of
+#' missing values for each protein is appended to that protein's label in the
+#' visualization when \code{plot = TRUE} and \code{show_labels = TRUE}.
+#'
#' @import dplyr
#' @import ggplot2
-#' @importFrom visdat vis_miss
+#' @import tidyr
+#' @importFrom glue glue
+#' @importFrom scales percent
#'
#' @returns A 2d dataframe including:
#' \itemize{
#' \item "count_miss": The count of missing values for each protein.
-#' \item "pct-miss": The percentage of missing values for each protein.
-#' \item "pct_total_miss": The percentage of missing values for each protein relative to
+#' \item "pct_miss_col": The percentage of missing values for each protein.
+#' \item "pct_miss_tot": The percentage of missing values for each protein relative to
#' the total missing values in the entire dataset.
#' }
#'
@@ -48,24 +61,81 @@
#'
#' @export
-dataMissing <- function(dataSet, plot = FALSE, show_labels = TRUE) {
+dataMissing <- function(dataSet, sort_miss = FALSE,
+ plot = FALSE, show_pct_legend = TRUE,
+ show_labels = TRUE, show_pct_col = TRUE) {
+
dataMissing <- select(dataSet, -c(R.Condition, R.Replicate))
- if (plot == TRUE) {
- if (show_labels == TRUE) {
- plot <- visdat::vis_miss(dataMissing)
+
+ if (sort_miss) {
+ dataMissing <- dataMissing[,names(sort(colSums(is.na(dataMissing)), decreasing = TRUE))]
+ }
+
+ if (plot) {
+
+ plotdf <- dataMissing %>%
+ mutate(row = row_number()) %>%
+ pivot_longer(cols = -row, names_to = "variable", values_to = "value",
+ values_transform = list(value = is.na))
+
+ if (show_pct_legend) {
+ pct_missing <- mean(is.na(dataMissing))*100
+ if (pct_missing == 0) {
+ lab_missing <- "No Missing Values"
+ lab_present <- "Present (100%)"
+ } else if (pct_missing < 0.1) {
+ lab_missing <- "Missing (< 0.1%)"
+ lab_present <- "Present (> 99.9%)"
+ } else {
+ pct_missing <- round(pct_missing, 1)
+ pct_present <- 100 - pct_missing
+ lab_missing <- glue::glue("Missing\n({pct_missing}%)")
+ lab_present <- glue::glue("Present\n({pct_present}%)")
+ }
+ } else {
+ lab_missing <- "Missing"
+ lab_present <- "Present"
+ }
+
+ plot <- ggplot(plotdf, aes(x = variable, y = row)) +
+ geom_raster(aes(fill = value)) +
+ scale_fill_manual(name = "", breaks = c("TRUE", "FALSE"),
+ values = c("grey20", "grey80"),
+ labels = c(lab_missing, lab_present)) +
+ scale_y_reverse() +
+ theme_minimal() +
+ labs(x = "", y = "Observations") +
+ theme(legend.position = "bottom",
+ axis.text.x = element_text(angle = 45, hjust = 0))
+
+ if (show_labels) {
+ if (show_pct_col) {
+ lab_pct_miss_col <- colMeans(is.na(dataMissing)) %>%
+ sapply(function(x) {
+ case_when(x == 0 ~ "0%",
+ x < 0.001 ~ "<0.1%",
+ x < 0.01 ~ "<1%",
+ x >= 0.01 ~ scales::percent(x, accuracy = 1))
+ })
+ plot <- plot +
+ scale_x_discrete(position = "top", limits = names(dataMissing),
+ labels = glue::glue("{names(lab_pct_miss_col)} ({lab_pct_miss_col})"))
+ } else {
+ plot <- plot +
+ scale_x_discrete(position = "top", limits = names(dataMissing))
+ }
} else {
- plotData <- dataMissing
- colnames(plotData) <- sprintf("%0*d", nchar(ncol(dataMissing)), 1:ncol(dataMissing))
- plot <- visdat::vis_miss(plotData) +
- ggplot2::scale_x_discrete(labels = element_blank())
+ plot <- plot +
+ scale_x_discrete(position = "top", labels = element_blank())
}
+
print(plot)
}
count_miss <- colSums(is.na(dataMissing))
result <- data.frame(count_miss,
- pct_miss = count_miss/nrow(dataMissing)*100,
- pct_total_miss = count_miss/sum(count_miss)*100)
+ pct_miss_col = colMeans(is.na(dataMissing))*100,
+ pct_miss_tot = count_miss/sum(count_miss)*100)
return(as.data.frame(t(result)))
}
diff --git a/R/globals.R b/R/globals.R
index 2789095..b615df7 100644
--- a/R/globals.R
+++ b/R/globals.R
@@ -5,6 +5,8 @@ utils::globalVariables(c(
"R.Replicate", #
Statistical
Consulting Services, UConn
vignettes/scaffold.Rmd
scaffold.Rmd
vignettes/usage_template.Rmd
usage_template.Rmd