ncborcherding · ncborcherding · Aug 27, 2024 · Mar 25, 2024 · Mar 25, 2024 · Apr 4, 2024
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -1,3 +1,5 @@
 ^\.github$
 ^www$
 ^codecov\.yml$
+^.*\.Rproj$
+^\.Rproj\.user$
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -18,8 +18,6 @@ jobs:
       fail-fast: false
       matrix:
         config:
-          - {os: macos-latest,   r: 'release'}
-          - {os: windows-latest, r: 'release'}
           - {os: ubuntu-latest,   r: 'release'}
 
     env:
@@ -46,7 +44,7 @@ jobs:
           needs: check
 
       - name: Add GSVA repo
-        run: Rscript -e 'remotes::install_github("rcastelo/GSVA")'
+        run: Rscript -e 'remotes::install_github("rcastelo/GSVA@27d70c068f12f922e5ca2f363626089310dc2a2b")'
 
       - uses: r-lib/actions/check-r-package@v2
         with:

diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -30,13 +30,14 @@ jobs:
         run: Rscript -e 'install.packages("remotes")'
 
       - name: Add GSVA repo
-        run: Rscript -e 'remotes::install_github("rcastelo/GSVA")'
+        run: Rscript -e 'remotes::install_github("rcastelo/GSVA@27d70c068f12f922e5ca2f363626089310dc2a2b")'
 
       - name: Test coverage
         run: |
           covr::codecov(
             quiet = FALSE,
             clean = FALSE,
+            exclusions = "R/global.R",
             install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package")
           )
         shell: Rscript {0}

diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,6 @@
-
+.Rbuildignore
 .DS_Store
+.RHistory
+escape.Rproj
+.Rproj*
+.RData
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,15 +1,17 @@
 Package: escape
 Title: Easy single cell analysis platform for enrichment
-Version: 1.99.1
-Date: 2024-02-29
+Version: 2.0.1
+Date: 2024-07-26
 Authors@R: c(
     person(given = "Nick", family = "Borcherding", role = c("aut", "cre"), email = "[email protected]"),
-    person(given = "Jared", family = "Andrews", role = c("aut"), email = "[email protected]"))
+    person(given = "Jared", family = "Andrews", role = c("aut"), email = "[email protected]"),
+    person(given = "Alexei", family = "Martsinkovskiy", role = c("ctb"), email = "[email protected]")
+    )
 Description: A bridging R package to facilitate gene set enrichment analysis (GSEA) in the context of single-cell RNA sequencing. Using raw count information, Seurat objects, or SingleCellExperiment format, users can perform and visualize ssGSEA, GSVA, AUCell, and UCell-based enrichment calculations across individual cells. 
 License: MIT + file LICENSE 
 Encoding: UTF-8
 LazyData: false
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 biocViews: Software, SingleCell, Classification, Annotation, GeneSetEnrichment, Sequencing, GeneSignaling, Pathways
 Depends: R (>= 4.1)
 Imports: 
@@ -34,7 +36,8 @@ Imports:
     UCell,
     stringr,
     methods,
-    SeuratObject
+    SeuratObject,
+    Matrix
 Suggests: 
     Seurat,
     hexbin,

diff --git a/NAMESPACE b/NAMESPACE
@@ -27,6 +27,7 @@ importFrom(GSVA,gsvaParam)
 importFrom(GSVA,ssgseaParam)
 importFrom(MatrixGenerics,rowSds)
 importFrom(MatrixGenerics,rowSums2)
+importFrom(SeuratObject,Assays)
 importFrom(SeuratObject,CreateAssayObject)
 importFrom(SeuratObject,CreateDimReducObject)
 importFrom(SeuratObject,Idents)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# escape VERSION 2.0.1 (2024-07-26)
+
+## UNDERLYING CHANGES
+
+* fixed ```performNormalization()``` errors when input.data was a matrix, now requires single-cell object and enrichment data
+* passing parallel processing properly to ```runEscape()``` function.
+
 # escape VERSION 1.99.1 (2024-02-29)
 
 ## UNDERLYING CHANGES

diff --git a/R/performNormalization.R b/R/performNormalization.R
@@ -7,15 +7,23 @@
 #' evaluation for log2-fold change, but will alter the original 
 #' enrichment values.
 #' 
-#' @param input.data Enrichment output from \code{\link{escape.matrix}} or
-#' \code{\link{runEscape}}.
-#' @param assay Name of the assay to plot if data is a single-cell object.
+#' @param sc.data Single-cell object or matrix used in the gene set enrichment calculation in 
+#' \code{\link{escape.matrix}} or \code{\link{runEscape}}.
+#' @param enrichment.data The enrichment results from \code{\link{escape.matrix}} 
+#' or \code{\link{runEscape}} (optional)
+#' @param assay Name of the assay to normalize if using a single-cell object
 #' @param gene.sets The gene set library to use to extract 
-#' the individual gene set information from.
+#' the individual gene set information from
 #' @param scale.factor A vector to use for normalizing enrichment scores per cell.
 #' @param make.positive Shift enrichment values to a positive range \strong{TRUE}
 #' for downstream analysis or not \strong{FALSE} (default).
-#' 
+#' @param groups The number of cells to normalize at once; the data are split
+#' into chunks of at most \code{groups} cells. Useful in case of memory issues.
+#' @importFrom stringr str_replace_all
+#' @importFrom SeuratObject Assays
+#' @importFrom SummarizedExperiment assays
+#' @importFrom Matrix colSums
+
 #' @examples
 #' GS <- list(Bcells = c("MS4A1", "CD79B", "CD79A", "IGH1", "IGH2"),
 #'            Tcells = c("CD3E", "CD3D", "CD3G", "CD7","CD8A"))
@@ -29,23 +37,37 @@
 #'                                    gene.sets = GS)
 #'
 #' @export
-#' 
 #' @return Single-cell object or matrix of normalized enrichment scores
 
-performNormalization <- function(input.data,
-                                 assay = NULL,
-                                 gene.sets = NULL,
-                                 make.positive = FALSE,
-                                 scale.factor = NULL) {
+
+
+
+
+performNormalization <- function(sc.data,
+                                        enrichment.data = NULL,
+                                        assay = "escape",
+                                        gene.sets = NULL,
+                                        make.positive = FALSE,
+                                        scale.factor = NULL,
+                                        groups = NULL) {
+  # FALSE unless a Seurat/SCE object reports the assay; matrix input must not leave this undefined
+  assay.present <- FALSE
+  if(!is.null(assay)) {
+    if(is_seurat_object(sc.data)) {
+      assay.present <- assay %in% Assays(sc.data)
+    } else if (is_se_object(sc.data)) {
+      assay.present <- assay %in% assays(sc.data)
+    }
+  }
 
 
-  if(is_seurat_or_se_object(input.data)) {
-    enriched <- .pull.Enrich(input.data, assay)
+  if(is_seurat_or_se_object(sc.data) & !is.null(assay) & assay.present) {
+    enriched <- .pull.Enrich(sc.data, assay)
   } else {
-    enriched <- input.data
+    enriched <- enrichment.data
   }
 
-  if(!is.null(scale.factor) & length(scale.factor) != dim(input.data)[2]) {
+  if(!is.null(scale.factor) & length(scale.factor) != dim(sc.data)[2]) {
     stop("If using a vector as a scale factor, please ensure the length matches the number of cells.")
   }
 
@@ -55,54 +77,48 @@ performNormalization <- function(input.data,
   egc <- egc[names(egc) %in% colnames(enriched)]
 
   #Isolating the number of genes per cell expressed
-  cnts <- .cntEval(input.data, assay = "RNA", type = "counts")
+  if(is.null(groups)){
+    chunks <- dim(enriched)[[1]]
+  }
+  else{
+    chunks <- min(groups, dim(enriched)[[1]])
+  }
 
-  if(is.null(scale.factor)) {
+  if (is.null(scale.factor)) {
+    cnts <- .cntEval(sc.data, assay = "RNA", type = "counts")
     print("Calculating features per cell...")
-
-    # Pre-compute which genes are non-zero in each sample
-    non_zero_indices <- lapply(seq_len(ncol(cnts)), function(y) {
-      which(cnts[, y] != 0)
-    })
-
-    # Convert gene sets to a list of indices
-    egc_indices <- lapply(egc, function(x) {
-      which(rownames(cnts) %in% x)
-    })
-
-    egc.size <- lapply(egc_indices, function(gene_set_indices) {
-      sapply(non_zero_indices, function(sample_indices) {
-        length(intersect(sample_indices, gene_set_indices))
-      })
+    egc.sizes <- lapply(egc, function(x){
+      scales<-unname(Matrix::colSums(cnts[which(rownames(cnts) %in% x),]!=0))
+      scales[scales==0] <- 1
+      scales
     })
+    egc.sizes <- split_rows(do.call(cbind,egc.sizes), chunk.size=chunks)
+    rm(cnts)
   }
+  else{
+    egc.sizes <- split_vector(scale.factor, chunk.size=chunks)
+  }
+  enriched <- split_rows(enriched, chunk.size=chunks)
 
   print("Normalizing enrichment scores per cell...")
   #Dividing the enrichment score by number of genes expressed
-  lapply(seq_len(ncol(enriched)), function(x) {
-    if (!is.null(scale.factor)) {
-      enriched[,x] <- enriched[,x]/scale.factor
-    } else {
-      gene.set <- unlist(egc.size[colnames(enriched)[x]])
-      if(any(gene.set == 0)) {
-        gene.set[which(gene.set == 0)] <- 1
-      }
-      enriched[,x] <- enriched[,x]/gene.set
-    }
-    if(any(enriched[,x] < 0) & make.positive) {
-      enriched[,x] <- enriched[,x] + abs(min(enriched[,x]))
+
+  enriched<-mapply(function(scores, scales){
+    scores/scales
+  }, enriched, egc.sizes, SIMPLIFY = FALSE)
+  enriched <- do.call(rbind, enriched)
+  if(make.positive){
+    enriched <- apply(enriched, 2, function(x){
+      x+max(0, -min(x))
+    })
+  }
+  if(is_seurat_or_se_object(sc.data)) {
+    if(is.null(assay)) {
+      assay <- "escape"
     }
-    enriched[,x]
-  }) -> normalized.values
-
-  normalized.enriched <- do.call(cbind, normalized.values)
-  colnames(normalized.enriched) <- colnames(enriched)
-
-  if(is_seurat_or_se_object(input.data)) {
-    input.data <- .adding.Enrich(input.data, normalized.enriched, paste0(assay, "_normalized"))
-    return(input.data)
+    sc.data <- .adding.Enrich(sc.data, enriched, paste0(assay, "_normalized"))
+    return(sc.data)
   } else {
-    return(normalized.enriched)
+    return(enriched)
   }
-
-}
+}
diff --git a/R/runEscape.R b/R/runEscape.R
@@ -100,14 +100,6 @@ escape.matrix <- function(input.data,
                                      aucMaxRank = ceiling(0.2 * nrow(split.data[[i]])),
                                      verbose = FALSE,
                                      ...))
-
-            # a <- t(assay(suppressWarnings(
-            #               AUCell_run(exprMat = split.data[[i]], 
-            #                          geneSets = egc,
-            #                          normAUC = TRUE,
-            #                          BPPARAM = BPPARAM,
-            #                          aucMaxRank = ceiling(0.2 * nrow(split.data[[i]])),
-            #                          ...))))
 
           }
           scores[[i]] <- a
@@ -117,10 +109,12 @@ escape.matrix <- function(input.data,
 
     #Normalize based on dropout
     if(normalize) {
-      output <- performNormalization(output,
+      output <- performNormalization(sc.data = input.data,
+                                     enrichment.data = output,
                                      assay = NULL,
                                      gene.sets = gene.sets,
-                                     make.positive = make.positive)
+                                     make.positive = make.positive,
+                                     groups = groups)
     }
     return(output)
 }
@@ -175,7 +169,8 @@ runEscape <- function(input.data,
                               gene.sets = gene.sets,
                               method = method,
                               groups = groups,
-                              min.size = min.size)
+                              min.size = min.size,
+                              BPPARAM = BPPARAM)
 
   input.data <- .adding.Enrich(input.data, enrichment, new.assay.name)
   return(input.data)

diff --git a/R/utils.R b/R/utils.R
@@ -183,7 +183,7 @@ is_seurat_or_se_object <- function(obj) {
     new.assay <- suppressWarnings(CreateAssayObject(
                                   data = as.matrix(t(enrichment))))
 
-    sc[[enrichment.name]] <- new.assay
+    suppressWarnings(sc[[enrichment.name]] <- new.assay)
   } else if (inherits(sc, "SingleCellExperiment")) {
     altExp(sc, enrichment.name) <- SummarizedExperiment(assays = t(enrichment))
     names(assays(altExp(sc, enrichment.name))) <- enrichment.name
@@ -254,4 +254,51 @@ is_seurat_or_se_object <- function(obj) {
   return(values)
 }
 
+# Compute inclusive start/end indices that partition seq_len(n) into
+# consecutive chunks of at most chunk.size elements.
+# Adapted from UCell's split_data.matrix: the second-to-last chunk takes half
+# of the remaining elements so the final two chunks are balanced in size.
+# Returns a list of c(start, end) pairs; the list is empty when n is 0.
+.chunk_ranges <- function(n, chunk.size) {
+  if (n == 0L) {
+    return(list())
+  }
+  if (!is.numeric(chunk.size) || length(chunk.size) != 1L ||
+      is.na(chunk.size) || chunk.size < 1) {
+    stop("chunk.size must be a single number >= 1.")
+  }
+  n.chunks <- (n - 1) %/% chunk.size + 1
+  ranges <- vector("list", n.chunks)
+  start <- 1
+  for (i in seq_len(n.chunks)) {
+    if (i == n.chunks - 1) {
+      # Balance the last two chunks instead of leaving a tiny remainder.
+      remaining <- n - (i - 1) * chunk.size
+      end <- start + round(remaining / 2) - 1
+    } else {
+      end <- min(i * chunk.size, n)
+    }
+    ranges[[i]] <- c(start, end)
+    start <- end + 1
+  }
+  ranges
+}
+
+# Split a matrix-like object into a list of consecutive row chunks of at most
+# chunk.size rows each. Input without dimensions is split element-wise.
+split_rows <- function (matrix, chunk.size = 1000) 
+{
+  # Input without a dim attribute cannot be indexed with [i, ], so it is
+  # handled as a plain vector.
+  if (is.null(dim(matrix))) {
+    return(split_vector(matrix, chunk.size = chunk.size))
+  }
+  lapply(.chunk_ranges(nrow(matrix), chunk.size), function(r) {
+    # drop = FALSE keeps single-row and single-column chunks as matrices, so
+    # row names survive and the chunks can be rbind()-ed back together.
+    matrix[r[1]:r[2], , drop = FALSE]
+  })
+}
+
+# Split a vector into a list of consecutive chunks of at most chunk.size
+# elements each, using the same chunk boundaries as split_rows.
+split_vector <- function (vector, chunk.size = 1000) 
+{
+  lapply(.chunk_ranges(length(vector), chunk.size), function(r) {
+    vector[r[1]:r[2]]
+  })
+}
+