diff --git a/DESCRIPTION b/DESCRIPTION index d8ec696b..1f9a9460 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -75,15 +75,12 @@ Suggests: ggrepel, gprofiler2, IRanges, - irlba, knitr, org.Hs.eg.db, plotly, psych, RcppPlanc, reactome.db, - rgl, - rglwidget, rmarkdown, sankey, scattermore (>= 0.7), diff --git a/NAMESPACE b/NAMESPACE index 84bd3aaa..8a66e2e2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,8 @@ S3method(plotSpatial2D,liger) S3method(plotSpatial2D,ligerSpatialDataset) S3method(quantileNorm,Seurat) S3method(quantileNorm,liger) +S3method(runCINMF,Seurat) +S3method(runCINMF,liger) S3method(runINMF,Seurat) S3method(runINMF,liger) S3method(runIntegration,Seurat) @@ -156,6 +158,7 @@ export(restoreH5Liger) export(restoreOnlineLiger) export(retrieveCellFeature) export(reverseMethData) +export(runCINMF) export(runCluster) export(runDoubletFinder) export(runGOEnrich) diff --git a/R/RcppExports.R b/R/RcppExports.R index 127b8e2b..874af8ed 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -5,6 +5,14 @@ RunModularityClusteringCpp <- function(SNN, modularityFunction, resolution, algo .Call(`_rliger2_RunModularityClusteringCpp`, SNN, modularityFunction, resolution, algorithm, nRandomStarts, nIterations, randomSeed, printOutput, edgefilename) } +colNormalize_dense_cpp <- function(x, L) { + .Call(`_rliger2_colNormalize_dense_cpp`, x, L) +} + +colAggregateMedian_dense_cpp <- function(x, group, n) { + .Call(`_rliger2_colAggregateMedian_dense_cpp`, x, group, n) +} + scaleNotCenter_byRow_rcpp <- function(x) { .Call(`_rliger2_scaleNotCenter_byRow_rcpp`, x) } diff --git a/R/archive.R b/R/archive.R index ce07d9a6..41c7ca8e 100644 --- a/R/archive.R +++ b/R/archive.R @@ -107,126 +107,3 @@ # featureIdx # return(ld) # } - -#' #' Perform Wilcoxon rank-sum test -#' #' @description Perform Wilcoxon rank-sum tests on specified dataset using -#' #' given method. -#' #' @param object A \linkS4class{liger} object with cluster assignment available. -#' #' @param useDatasets A character vector of the names, a numeric or logical -#' #' vector of the index of the datasets to be normalized. Default -#' #' \code{NULL} uses all datasets. -#' #' @param method Choose from \code{"clusters"} or \code{"datasets"}. Default -#' #' \code{"clusters"} compares between clusters across all datasets, while -#' #' \code{"datasets"} compares between datasets within each cluster. -#' #' @param useCluster The name of the column in \code{cellMeta} slot storing the -#' #' cluster assignment variable. Default \code{"leiden_cluster"} -#' #' @param usePeak Logical, whether to test peak counts instead of gene -#' #' expression. Requires presence of ATAC modility datasets. Default -#' #' \code{FALSE}. -#' #' @param verbose Logical. Whether to show information of the progress. Default -#' #' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. -#' #' @param data.use,compare.method \bold{Deprecated}. See Usage section for -#' #' replacement. -#' #' @return A 10-columns data.frame with test results. -#' #' @export -#' #' @examples -#' #' library(dplyr) -#' #' result <- runWilcoxon(pbmcPlot) -#' #' result %>% group_by(group) %>% top_n(2, logFC) -#' runWilcoxon <- function( -#' object, -#' useDatasets = NULL, -#' method = c("clusters", "datasets"), -#' useCluster = "leiden_cluster", -#' usePeak = FALSE, -#' verbose = getOption("ligerVerbose"), -#' # Deprecated coding style, -#' data.use = useDatasets, -#' compare.method = method -#' ) { -#' .deprecateArgs(list(data.use = "useDatasets", compare.method = "method")) -#' method <- match.arg(method) -#' # Input checks #### -#' useDatasets <- .checkUseDatasets(object, useDatasets, -#' modal = ifelse(usePeak, "atac", "default")) -#' if (!isTRUE(usePeak)) { -#' if (method == "datasets" & length(useDatasets) < 2) -#' stop("Should have at least 2 datasets as input ", -#' "when compare between datasets") -#' if (isH5Liger(object, useDatasets)) { -#' stop("HDF5 based datasets detected but is not supported. \n", -#' "Try `object.sub <- downsample(object, useSlot = ", -#' "'normData')` to create ANOTHER object with in memory data.") -#' } -#' allNormed <- all(sapply(datasets(object), -#' function(ld) !is.null(normData(ld)))) -#' if (!allNormed) -#' stop("All datasets being involved has to be normalized") -#' -#' ## get all shared genes of datasets involved -#' normDataList <- getMatrix(object, "normData", dataset = useDatasets, -#' returnList = TRUE) -#' features <- Reduce(intersect, lapply(normDataList, rownames)) -#' normDataList <- lapply(normDataList, function(x) x[features,]) -#' featureMatrix <- Reduce(cbind, normDataList) -#' } else { -#' if (method == "datasets" || length(useDatasets) != 1) -#' stop("For wilcoxon test on peak counts, can only use ", -#' "\"cluster\" method on one dataset.") -#' normPeakList <- lapply(useDatasets, function(d) normPeak(object, d)) -#' features <- Reduce(intersect, lapply(normPeakList, rownames)) -#' featureMatrix <- Reduce(cbind, normPeakList) -#' #featureMatrix <- normPeak(object, useDatasets) -#' if (is.null(featureMatrix)) -#' stop("Peak counts of specified dataset has to be normalized. ", -#' "Please try `normalizePeak(object, useDatasets = '", -#' useDatasets, "')`.") -#' #features <- rownames(featureMatrix) -#' } -#' -#' ## Subset metadata involved -#' cellIdx <- object$dataset %in% useDatasets -#' cellSource <- object$dataset[cellIdx] -#' clusters <- .fetchCellMetaVar(object, useCluster, cellIdx = cellIdx, -#' checkCategorical = TRUE) -#' -#' if (isTRUE(verbose)) -#' .log("Performing Wilcoxon test on ", length(useDatasets), " datasets: ", -#' paste(useDatasets, collapse = ", ")) -#' # perform wilcoxon test #### -#' if (method == "clusters") { -#' # compare between clusters across datasets -#' nfeatures <- length(features) -#' if (nfeatures > 100000) { -#' if (isTRUE(verbose)) .log("Calculating Large-scale Input...") -#' results <- Reduce(rbind, lapply( -#' suppressWarnings(split(seq(nfeatures), -#' seq(nfeatures / 100000))), -#' function(index) { -#' fm <- log1p(1e10*featureMatrix[index, ]) -#' wilcoxauc(fm, clusters) -#' })) -#' } else { -#' # TODO: If we add log-transformation to normalization method in the -#' # future, remember to have conditions here. -#' fm <- log1p(1e10*featureMatrix) -#' results <- wilcoxauc(fm, clusters) -#' } -#' } else { -#' # compare between datasets within each cluster -#' results <- Reduce(rbind, lapply(levels(clusters), function(cluster) { -#' clusterIdx <- clusters == cluster -#' subLabel <- paste0(cluster, "-", cellSource[clusterIdx]) -#' if (length(unique(subLabel)) == 1) { -#' # if cluster has only 1 data source -#' warning("Skipped Cluster ", cluster, -#' " since it has only one dataset source.") -#' return() -#' } else { -#' subMatrix <- log1p(1e10*featureMatrix[, clusterIdx]) -#' return(wilcoxauc(subMatrix, subLabel)) -#' } -#' })) -#' } -#' return(results) -#' } diff --git a/R/cINMF.R b/R/cINMF.R new file mode 100644 index 00000000..b8d50138 --- /dev/null +++ b/R/cINMF.R @@ -0,0 +1,595 @@ +#' Perform consensus iNMF on scaled datasets +#' @description +#' Performs consensus integrative non-negative matrix factorization (c-iNMF) +#' to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. We run the +#' regular iNMF multiple times with different random starts, and then take the +#' consensus of frequently appearing factors from gene loading matrices, \eqn{W} +#' and \eqn{V}. The cell factor loading \eqn{H} matrices are eventually solved +#' with the consensus \eqn{W} and \eqn{V} matrices. +#' +#' Please see \code{\link{runINMF}} for detailed introduction to the regular +#' iNMF algorithm which is run multiple times in this function. +#' +#' The consensus iNMF algorithm is developed basing on the consensus NMF (cNMF) +#' method (D. Kotliar et al., 2019). +#' @param object A \linkS4class{liger} object or a Seurat object with +#' non-negative scaled data of variable features (Done with +#' \code{\link{scaleNotCenter}}). +#' @param k Inner dimension of factorization (number of factors). Generally, a +#' higher \code{k} will be needed for datasets with more sub-structure. Default +#' \code{20}. +#' @param lambda Regularization parameter. Larger values penalize +#' dataset-specific effects more strongly (i.e. alignment should increase as +#' \code{lambda} increases). Default \code{5}. +#' @param rho Numeric number between 0 and 1. Fraction for determining the +#' number of nearest neighbors to look at for consensus (by +#' \code{rho * nRandomStarts}). Default \code{0.3}. +#' @param nIteration Total number of block coordinate descent iterations to +#' perform. Default \code{30}. +#' @param nRandomStarts Number of replicate runs for creating the pool of +#' factorization results. Default \code{10}. +#' @param HInit Initial values to use for \eqn{H} matrices. A list object where +#' each element is the initial \eqn{H} matrix of each dataset. Default +#' \code{NULL}. +#' @param WInit Initial values to use for \eqn{W} matrix. A matrix object. +#' Default \code{NULL}. +#' @param VInit Initial values to use for \eqn{V} matrices. A list object where +#' each element is the initial \eqn{V} matrix of each dataset. Default +#' \code{NULL}. +#' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. +#' @param verbose Logical. Whether to show information of the progress. Default +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. +#' @param ... Arguments passed to methods. +#' @rdname runCINMF +#' @export +#' @return +#' \itemize{ +#' \item{liger method - Returns updated input \linkS4class{liger} object +#' \itemize{ +#' \item{A list of all \eqn{H} matrices can be accessed with +#' \code{getMatrix(object, "H")}} +#' \item{A list of all \eqn{V} matrices can be accessed with +#' \code{getMatrix(object, "V")}} +#' \item{The \eqn{W} matrix can be accessed with +#' \code{getMatrix(object, "W")}} +#' }} +#' \item{Seurat method - Returns updated input Seurat object +#' \itemize{ +#' \item{\eqn{H} matrices for all datasets will be concatenated and +#' transposed (all cells by k), and form a DimReduc object in the +#' \code{reductions} slot named by argument \code{reduction}.} +#' \item{\eqn{W} matrix will be presented as \code{feature.loadings} in the +#' same DimReduc object.} +#' \item{\eqn{V} matrices, an objective error value and the dataset +#' variable used for the factorization is currently stored in +#' \code{misc} slot of the same DimReduc object.} +#' }} +#' } +#' @references +#' Joshua D. Welch and et al., Single-Cell Multi-omic Integration Compares and +#' Contrasts Features of Brain Cell Identity, Cell, 2019 +#' +#' Dylan Kotliar and et al., Identifying gene expression programs of cell-type +#' identity and cellular activity with single-cell RNA-Seq, eLife, 2019 +#' @examples +#' pbmc <- normalize(pbmc) +#' pbmc <- selectGenes(pbmc) +#' pbmc <- scaleNotCenter(pbmc) +#' if (requireNamespace("RcppPlanc", quietly = TRUE)) { +#' pbmc <- runCINMF(pbmc) +#' } +runCINMF <- function( + object, + k = 20, + lambda = 5.0, + rho = 0.3, + ... +) { + UseMethod("runCINMF", object) +} + +#' @rdname runCINMF +#' @export +#' @method runCINMF liger +runCINMF.liger <- function( + object, + k = 20, + lambda = 5.0, + rho = 0.3, + nIteration = 30, + nRandomStarts = 10, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), + ... +) { + .checkObjVersion(object) + object <- recordCommand(object, ..., dependencies = "RcppPlanc") + object <- removeMissing(object, orient = "cell", verbose = verbose) + data <- lapply(datasets(object), function(ld) { + if (is.null(scaleData(ld))) + cli::cli_abort("Scaled data not available. Run {.fn scaleNotCenter} first.") + return(scaleData(ld)) + }) + dataClasses <- sapply(data, function(x) class(x)[1]) + if (!all(dataClasses == dataClasses[1])) { + cli::cli_abort("Currently the {.field scaledData} of all datasets have to be of the same class.") + } + out <- .runCINMF.list( + object = data, + k = k, + lambda = lambda, + rho = rho, + nIteration = nIteration, + nRandomStarts = nRandomStarts, + HInit = HInit, + WInit = WInit, + VInit = VInit, + seed = seed, + nCores = nCores, + verbose = verbose + ) + # return(out) + object@W <- out$W + for (d in names(object)) { + object@datasets[[d]]@H <- out$H[[d]] + object@datasets[[d]]@V <- out$V[[d]] + # ld <- dataset(object, d) + # ld@H <- out$H[[d]] + # ld@V <- out$V[[d]] + # datasets(object, check = FALSE)[[d]] <- ld + } + object@uns$factorization <- list(k = k, lambda = lambda) + return(object) +} + +#' @rdname runCINMF +#' @export +#' @param datasetVar Metadata variable name that stores the dataset source +#' annotation. Default \code{"orig.ident"}. +#' @param layer For Seurat>=4.9.9, the name of layer to retrieve input +#' non-negative scaled data. Default \code{"ligerScaleData"}. For older Seurat, +#' always retrieve from \code{scale.data} slot. +#' @param assay Name of assay to use. Default \code{NULL} uses current active +#' assay. +#' @param reduction Name of the reduction to store result. Also used as the +#' feature key. Default \code{"cinmf"}. +#' @method runCINMF Seurat +runCINMF.Seurat <- function( + object, + k = 20, + lambda = 5.0, + rho = 0.3, + datasetVar = "orig.ident", + layer = "ligerScaleData", + assay = NULL, + reduction = "cinmf", + nIteration = 30, + nRandomStarts = 10, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), + ... +) { + assay <- assay %||% SeuratObject::DefaultAssay(object) + Es <- .getSeuratData(object, layer = layer, slot = "scale.data", + assay = assay) + # the last [,1] converts data.frame to the vector/factor + datasetVar <- object[[datasetVar]][,1] + if (!is.factor(datasetVar)) datasetVar <- factor(datasetVar) + datasetVar <- droplevels(datasetVar) + + if (!is.list(Es)) { + Es <- splitRmMiss(Es, datasetVar) + Es <- lapply(Es, methods::as, Class = "CsparseMatrix") + } + + for (i in seq_along(Es)) { + if (any(Es[[i]]@x < 0)) { + cli::cli_abort( + c("x" = "Negative data encountered for integrative {.emph Non-negative} Matrix Factorization.", + "!" = "Please run {.fn scaleNotCenter} first.") + ) + } + } + + res <- .runCINMF.list( + object = Es, + k = k, + lambda = lambda, + rho = rho, + nIteration = nIteration, + nRandomStarts = nRandomStarts, + HInit = HInit, + WInit = WInit, + VInit = VInit, + seed = seed, + nCores = nCores, + verbose = verbose + ) + Hconcat <- t(Reduce(cbind, res$H)) + colnames(Hconcat) <- paste0(reduction, "_", seq_len(ncol(Hconcat))) + object[[reduction]] <- Seurat::CreateDimReducObject( + embeddings = Hconcat, + loadings = res$W, + assay = assay, + misc = list(V = res$V, objErr = res$objErr, dataset = datasetVar) + ) + return(object) +} + +#' @param barcodeList List object of barcodes for each datasets, for setting +#' dimnames of output \eqn{H} matrices. Default \code{NULL} uses \code{colnames} +#' of matrices in the \code{object}. +#' @param features Character vector of feature names, for setting dimnames of +#' output \eqn{V} and \eqn{W} matrices. Default \code{NULL} uses \code{rownames} +#' of matrices in the \code{object}. +#' @return The list method returns a list of entries \code{H}, \code{V} and +#' \code{W}. \code{H} is a list of \eqn{H} matrices for each dataset. \code{V} +#' is a list of \eqn{V} matrices for each dataset. \code{W} is the shared +#' \eqn{W} matrix. +#' @noRd +.runCINMF.list <- function( + object, + k = 20, + lambda = 5.0, + rho = 0.3, + nIteration = 30, + nRandomStarts = 10, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE) +) { + if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start + cli::cli_abort( + "Package {.pkg RcppPlanc} is required for c-iNMF integration. + Please install it by command: + {.code devtools::install_github('welch-lab/RcppPlanc')}") # nocov end + if (nRandomStarts <= 1) { + cli::cli_abort("{.var nRandomStarts} must be greater than 1 for taking the consensus.") + } + if (rho <= 0 || rho > 1) { + cli::cli_abort(c("Invalid value for {.var rho}.", + "i" = "{.var rho} must be in the range (0, 1].")) + } + if (round(rho * nRandomStarts) < 1) { + cli::cli_abort(c("Too few nearest neighbors to take the consensus. ", + "i" = "Please use a larger {.var rho} or/and a larger {.var nRandomStarts}.")) + } + + barcodeList <- lapply(object, colnames) + allFeatures <- lapply(object, rownames) + features <- Reduce(.same, allFeatures) + + Ws <- list() + # Each dataset got a list of replicate V matrices + Vs <- stats::setNames(rep(list(list()), length(object)), names(object)) + # Each dataset got a H matrices + Hs <- stats::setNames(rep(list(NULL), length(object)), names(object)) + if (isTRUE(verbose)) + cli::cli_progress_bar(name = "Replicating iNMF runs", + status = sprintf("[ 0 / %d ]", nRandomStarts), + total = nRandomStarts) + set.seed(seed) + for (i in seq(nRandomStarts)) { + repName <- paste0("rep_", i) + out <- RcppPlanc::inmf(objectList = object, k = k, lambda = lambda, + niter = nIteration, Hinit = HInit, + Vinit = VInit, Winit = WInit, nCores = nCores, + verbose = FALSE) + Ws[[repName]] <- out$W + for (n in names(object)) Vs[[n]][[repName]] <- out$V[[n]] + for (n in names(object)) Hs[[n]][[repName]] <- out$H[[n]] + if (isTRUE(verbose)) + cli::cli_progress_update( + status = sprintf("[ %d / %d ]", i, nRandomStarts), set = i + ) + } + # return(list(W = Ws, V = Vs, H = Hs)) + if (isTRUE(verbose)) cliID <- cli::cli_process_start("Taking the consensus") + # Attempt 1 #### + # W <- takeConsensus(Ws, rho = rho, tao = tao) + # Vs <- lapply(Vs, takeConsensus, rho = rho, tao = tao) + + # Attempt 2 #### + # factorSelect <- takeConsensus2(Ws, rho = rho, tao = tao) + # W <- Reduce(cbind, Ws)[, factorSelect$idx] + # W <- colNormalize_dense_cpp(W, L = 2) + # W <- colAggregateMedian_dense_cpp(W, group = factorSelect$cluster - 1, n = k) + # W <- colNormalize_dense_cpp(W, L = 1) + # for (i in seq_along(object)) { + # V <- Reduce(cbind, Vs[[i]])[, factorSelect$idx] + # V <- colNormalize_dense_cpp(V, L = 2) + # V <- colAggregateMedian_dense_cpp(V, group = factorSelect$cluster - 1, n = k) + # Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + # } + + # Attempt 3 #### + # W <- Reduce(cbind, Ws) + # selection <- cluster_sel(W, nNeighbor = 3, k = k, resolution = 0.8) + # W <- W[, selection$idx] + # W <- colNormalize_dense_cpp(W, L = 2) + # print(table(selection$cluster - 1)) + # W <- colAggregateMedian_dense_cpp(W, group = selection$cluster, n = k) + # W <- colNormalize_dense_cpp(W, L = 1) + # for (i in seq_along(Vs)) { + # V <- Reduce(cbind, Vs[[i]]) + # V <- V[, selection$idx] + # V <- colNormalize_dense_cpp(V, L = 2) + # V <- colAggregateMedian_dense_cpp(V, group = selection$cluster, n = k) + # Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + # } + + # Attempt 4 #### + nNeighbor <- round(rho * nRandomStarts) + geneLoadings <- list(Reduce(cbind, Ws)) + for (i in seq_along(object)) { + geneLoadings[[i + 1]] <- Reduce(cbind, Vs[[i]]) + } + geneLoadings <- lapply(geneLoadings, colNormalize_dense_cpp, L = 2) + # geneLoadings is a list, each element is a ngene by (nFactor * nRandomStart) matrix + # The first is for W, the rest are for Vs + selection <- factor_cluster_sel(geneLoadings, nNeighbor = nNeighbor, + minWeight = 0.6, k = k, resolution = 0.2) + W <- geneLoadings[[1]][, selection$idx] + W <- colAggregateMedian_dense_cpp(W, group = selection$cluster, n = k) + W <- colNormalize_dense_cpp(W, L = 1) + for (i in seq_along(Vs)) { + V <- geneLoadings[[i + 1]][, selection$idx] + V <- colAggregateMedian_dense_cpp(V, group = selection$cluster, n = k) + Vs[[i]] <- colNormalize_dense_cpp(V, L = 1) + } + + if (isTRUE(verbose)) cli::cli_process_done(id = cliID) + + msg <- "Solving the last ANLS iterations" + if (isTRUE(verbose)) cliID <- cli::cli_process_start(msg = msg) + for (i in seq_along(object)) { + Hs[[i]] <- inmf_solveH(NULL, W, Vs[[i]], object[[i]], lambda, nCores = nCores) + } + # for (i in seq_along(object)) { + # Vs[[i]] <- inmf_solveV(Hs[[i]], W, NULL, object[[i]], lambda, nCores = nCores) + # } + # objErr <- sum(sapply(seq_along(object), function(i) + # inmf_objErr_i(H = Hs[[i]], W = W, V = Vs[[i]], E = object[[i]], + # lambda = lambda) + # )) + # if (isTRUE(verbose)) + # cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {objErr}")) + + # factors <- paste0("Factor_", seq(k)) + # dimnames(W) <- list(features, factors) + # for (i in seq_along(object)) { + # dimnames(Hs[[i]]) <- list(barcodeList[[i]], factors) + # Hs[[i]] <- t(Hs[[i]]) + # dimnames(Vs[[i]]) <- list(features, factors) + # } + # names(Hs) <- names(Vs) <- names(object) + # result <- list(W = W, H = Hs, V = Vs, objErr = objErr) + # return(result) + + out <- .runINMF.list(object, k = k, lambda = lambda, nIteration = 1, + HInit = Hs, WInit = W, VInit = Vs, verbose = FALSE) + if (isTRUE(verbose)) + cli::cli_process_done(id = cliID, msg_done = paste(msg, " ... objective error: {out$objErr}")) + return(out) +} + + +# takeConsensus <- function(matList, rho = 0.3, tao = 0.1) { +# ## According to cNMF method. +# ## G - The program matrix. what we call W or V +# ## rho - fraction parameter to set number of nearest neighbors to look at +# ## tao - distance threshold to filter out factors +# +# ## matList are matrices of dimensionality gene x factor +# # Step 1: Concatenate and l2 normalize +# ## R - a number of replicates +# G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) +# G_R <- colNormalize_dense_cpp(G_R, L = 2) +# +# # Step 2: Find kNN matching for each component (factor) +# # and filter by mean distance against the kNN of each +# nNeighbor <- round(rho * length(matList)) +# knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) +# +# # `select_factor_cpp` is a C++ function that returns the indices of the +# # factors that are selected by examining whether the mean euclidean distance +# # to its NNs is less than the threshold `tao`. +# selectedIdx <- rowMeans(knn$nn.dist[,2:(nNeighbor + 1)]) < tal +# # selectedIdx <- select_factor_cpp(all_data = G_R, +# # knn = knn$nn.idx - 1, +# # threshold = tao) +# +# G_R <- G_R[, selectedIdx] # genes by selected factors +# if (ncol(G_R) < ncol(matList[[1]])) { +# cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", +# "i" = "Please try: ", +# "*" = "a larger {.var tao} for loosen threshold", +# "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) +# } +# # Step 3: Kmeans clustering to get the k consensus +# cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster +# G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) +# G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) +# return(G_consensus) +# } + +# takeConsensus2 <- function(matList, rho = 0.3, tao = 0.1) { +# # 2nd attempt of methods +# # Return the selection of factors from the replicate pool as well as the +# # grouping on the selected ones. So this part of information will be used +# # to select the V factors that represent the same thing as what we want +# # from W +# +# ## According to cNMF method. +# ## G - The program matrix. what we call W or V +# ## rho - fraction parameter to set number of nearest neighbors to look at +# ## tao - distance threshold to filter out factors +# +# ## matList are matrices of dimensionality gene x factor +# # Step 1: Concatenate and l2 normalize +# ## R - a number of replicates +# G_R <- Reduce(cbind, matList) # ngene by (nFactor * nRandomStart) +# G_R <- colNormalize_dense_cpp(G_R, L = 2) +# +# # Step 2: Find kNN matching for each component (factor) +# # and filter by mean distance against the kNN of each +# nNeighbor <- round(rho * length(matList)) +# knn <- RANN::nn2(t(G_R), k = nNeighbor + 1) +# nnDistMeans <- rowMeans(knn$nn.dist[,2:(nNeighbor + 1)]) +# selectedIdx <- nnDistMeans < tao +# # `select_factor_cpp` is a C++ function that returns the indices of the +# # factors that are selected by examining whether the mean euclidean distance +# # to its NNs is less than the threshold `tao`. +# # selectedIdx <- select_factor_cpp(all_data = G_R, +# # knn = knn$nn.idx - 1, +# # threshold = tao) +# +# G_R <- G_R[, selectedIdx] # genes by selected factors +# if (ncol(G_R) < ncol(matList[[1]])) { +# cli::cli_abort(c("Too few factors are selected from the pool to take the consensus.", +# "i" = "Please try: ", +# "*" = "a larger {.var tao} for loosen threshold", +# "*" = "a larger {.var nRandomStarts} for more replicates to be used.")) +# } +# # Step 3: Kmeans clustering to get the k consensus +# cluster <- stats::kmeans(t(G_R), centers = ncol(matList[[1]]))$cluster +# return(list(idx = selectedIdx, cluster = cluster)) +# # G_consensus <- colAggregateMedian_dense_cpp(x = G_R, group = cluster - 1, n = ncol(matList[[1]])) +# # G_consensus <- colNormalize_dense_cpp(G_consensus, L = 1) +# # return(G_consensus) +# } + +inmf_solveH <- function(H, W, V, E, lambda, nCores = 2L) { + WV <- W + V + CtC <- t(WV) %*% WV + lambda * t(V) %*% V + CtB <- t(WV) %*% E + H <- RcppPlanc::bppnnls_prod(CtC, as.matrix(CtB), nCores = nCores) + return(t(H)) # return cell x factor +} + +# inmf_solveV <- function(H, W, V, E, lambda, nCores = 2L) { +# HtH <- t(H) %*% H +# CtC <- (1 + lambda) * HtH +# CtB <- t(H) %*% t(E) - HtH %*% t(W) +# V <- RcppPlanc::bppnnls_prod(CtC, as.matrix(CtB), nCores = nCores) +# return(t(V)) +# } + +# inmf_solveW <- function(Hs, W, Vs, Es, lambda, nCores = 2L) { +# CtC <- matrix(0, ncol(Vs[[1]]), ncol(Vs[[1]])) +# CtB <- matrix(0, ncol(Vs[[1]]), nrow(Vs[[1]])) +# for (i in seq_along(Es)) { +# HtH <- t(Hs[[i]]) %*% Hs[[i]] +# CtC <- CtC + HtH +# CtB <- CtB + as.matrix(t(Hs[[i]]) %*% t(Es[[i]])) - HtH %*% t(Vs[[i]]) +# } +# W <- RcppPlanc::bppnnls_prod(CtC, CtB, nCores = nCores) +# return(t(W)) +# } + +# inmf_objErr_i <- function(H, W, V, E, lambda) { +# # Objective error function was originally stated as: +# # obj_i = ||E_i - (W + V_i)*H_i||_F^2 + lambda * ||V_i*H_i||_F^2 +# # (Use caution with the matrix dimensionality, as we might transpose them +# # occasionally in different implementation.) +# # +# # Let L = W + V +# # ||E - LH||_F^2 = ||E||_F^2 - 2*Tr(Ht*(Et*L)) + Tr((Lt*L)*(Ht*H)) +# # ||V*H||_F^2 = Tr((Vt*V)*(Ht*H)) +# # This improves the performance in both speed and memory usage. +# L <- W + V +# sqnormE <- Matrix::norm(E, "F")^2 +# LtL <- t(L) %*% L +# HtH <- t(H) %*% H +# TrLtLHtH <- sum(diag(LtL %*% HtH)) +# EtL <- t(E) %*% L +# TrHtEtL <- sum(Matrix::diag(t(H) %*% EtL)) +# VtV <- t(V) %*% V +# TrVtVHtH <- sum(diag(VtV %*% HtH)) +# obj <- sqnormE - 2 * TrHtEtL + TrLtLHtH + lambda * TrVtVHtH +# return(obj) +# } + + +# cluster_sel <- function( +# W, +# nNeighbor = 3, +# k = 20, +# resolution = 0.8 +# ) { +# knn <- RANN::nn2(t(W), k = nNeighbor + 1) +# edges <- numeric() +# for (i in 1:nrow(knn$nn.idx)) { +# for (j in 2:(nNeighbor + 1)) { +# edges <- c(edges, i, knn$nn.idx[i, j]) +# } +# } +# weights <- numeric() +# for (i in 1:nrow(knn$nn.idx)) { +# for (j in 2:(nNeighbor + 1)) { +# weights <- c(weights, knn$nn.dist[i, j]) +# } +# } +# weights <- exp(-weights/0.5) +# graph <- igraph::graph(edges, directed = FALSE) +# cluster <- clusterGraph(graph, weights, resolution, k) +# select <- names(sort(table(cluster), decreasing = TRUE))[1:k] +# idx <- cluster %in% select +# cluster <- cluster[cluster %in% select] +# cluster <- as.integer(factor(cluster)) - 1L +# return(list(idx = idx, cluster = cluster)) +# } + +# clusterGraph <- function(graph, weights, resolution, minCluster) { +# cluster <- igraph::cluster_leiden(graph, resolution_parameter = resolution, +# weights = weights, objective_function = "mod") +# cluster <- cluster$membership +# if (length(unique(cluster)) <= minCluster) { +# cluster <- clusterGraph(graph, weights, resolution + 0.1, minCluster) +# } +# return(cluster) +# } + + + +factor_cluster_sel <- function(geneLoadings, nNeighbor = 3, minWeight = 0.6, k = 20, + resolution = 0.2) { + graphList <- lapply(geneLoadings, function(w) { + knn <- RANN::nn2(t(w), k = nNeighbor + 1) + target <- as.integer(t(knn$nn.idx)) + source <- rep(seq_len(nrow(knn$nn.idx)), each = ncol(knn$nn.idx)) + weight <- as.numeric(t(knn$nn.dists)) + weight <- exp(-weight/0.5) + graphmat <- cbind(source, target, weight) + graphmat <- graphmat[graphmat[,3] > minWeight,] + return(graphmat) + }) + graphmat <- Reduce(rbind, graphList) + + # Leiden cluster. CPP implementation expects 0-based vertex index + edgevec <- as.integer(t(graphmat[,1:2])) - 1L + cluster <- leidenAlg::find_partition_with_rep_rcpp( + edgelist = edgevec, edgelist_length = length(edgevec), + num_vertices = ncol(geneLoadings[[1]]), direction = FALSE, niter = 5, + edge_weights = graphmat[,3], resolution = resolution, nrep = 20 + ) + + # Select top k clusters + selIdx <- cluster %in% names(sort(table(cluster), decreasing = TRUE))[seq_len(k)] + cluster <- cluster[selIdx] + cluster <- as.integer(factor(cluster)) - 1L + return(list(idx = selIdx, cluster = cluster)) +} diff --git a/R/clustering.R b/R/clustering.R index 5b153588..c16e0e71 100644 --- a/R/clustering.R +++ b/R/clustering.R @@ -82,7 +82,7 @@ runCluster <- function( if (!is.null(useDims)) H <- H[, useDims, drop = FALSE] if (isTRUE(verbose)) - cli::cli_process_start("{method} clustering on {type} cell factor loadings...") + cli::cli_process_start("{method} clustering on {type} cell factor loadings") knn <- RANN::nn2(H, k = nNeighbors, eps = eps) snn <- ComputeSNN(knn$nn.idx, prune = prune) if (!is.null(seed)) set.seed(seed) @@ -98,9 +98,7 @@ runCluster <- function( niter = nIterations, nrep = nRandomStarts ) } else { - edgeOutPath <- paste0("edge_", sub("\\s", "_", Sys.time()), '.txt') - edgeOutPath <- gsub("-", "", edgeOutPath) - edgeOutPath <- gsub(":", "", edgeOutPath) + edgeOutPath <- tempfile(pattern = "edge_", fileext = ".txt") WriteEdgeFile(snn, edgeOutPath, display_progress = FALSE) clusts <- RunModularityClusteringCpp( snn, diff --git a/R/embedding.R b/R/embedding.R index 7d42dd6e..7d738e9d 100644 --- a/R/embedding.R +++ b/R/embedding.R @@ -73,7 +73,7 @@ runUMAP <- function( useRaw <- Hsearch$useRaw type <- ifelse(useRaw, "unnormalized", "quantile normalized") if (isTRUE(verbose)) - cli::cli_process_start("Generating UMAP on {type} cell factor loadings...") + cli::cli_process_start("Generating UMAP on {type} cell factor loadings") if (!is.null(useDims)) H <- H[, useDims, drop = FALSE] umap <- uwot::umap(H, n_components = as.integer(nDims), @@ -166,7 +166,7 @@ runTSNE <- function( useRaw <- Hsearch$useRaw type <- ifelse(useRaw, "unnormalized", "quantile normalized") if (isTRUE(verbose)) - cli::cli_process_start("Generating TSNE ({method}) on {type} cell factor loadings...") + cli::cli_process_start("Generating TSNE ({method}) on {type} cell factor loadings") if (!is.null(useDims)) H <- H[, useDims, drop = FALSE] if (method == "Rtsne") { set.seed(seed) diff --git a/R/integration.R b/R/integration.R index 41130fd6..9e817ac7 100644 --- a/R/integration.R +++ b/R/integration.R @@ -18,7 +18,7 @@ #' \code{"iNMF"}, \code{"onlineINMF"}, \code{"UINMF"}. Default \code{"iNMF"}. #' @param seed Random seed to allow reproducible results. Default \code{1}. #' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other methods and wrapped functions. #' @export #' @rdname runIntegration @@ -60,7 +60,7 @@ runIntegration.liger <- function( lambda = 5.0, method = c("iNMF", "onlineINMF", "UINMF"), seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) { method <- match.arg(method) @@ -95,7 +95,7 @@ runIntegration.Seurat <- function( useLayer = "ligerScaleData", assay = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) { method <- match.arg(method) @@ -176,8 +176,10 @@ runIntegration.Seurat <- function( #' each element is the initial \eqn{V} matrix of each dataset. Default #' \code{NULL}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to methods. #' @rdname runINMF #' @export @@ -235,7 +237,8 @@ runINMF.liger <- function( WInit = NULL, VInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) { .checkObjVersion(object) @@ -260,9 +263,8 @@ runINMF.liger <- function( WInit = WInit, VInit = VInit, seed = seed, - verbose = verbose, - barcodeList = lapply(datasets(object), colnames), - features = varFeatures(object) + nCores = nCores, + verbose = verbose ) object@W <- out$W @@ -307,7 +309,8 @@ runINMF.Seurat <- function( WInit = NULL, VInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) { assay <- assay %||% SeuratObject::DefaultAssay(object) @@ -339,6 +342,7 @@ runINMF.Seurat <- function( WInit = WInit, VInit = VInit, seed = seed, + nCores = nCores, verbose = verbose ) Hconcat <- t(Reduce(cbind, res$H)) @@ -373,9 +377,8 @@ runINMF.Seurat <- function( WInit = NULL, VInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), - barcodeList = NULL, - features = NULL + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE) ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start cli::cli_abort( @@ -383,6 +386,10 @@ runINMF.Seurat <- function( Please install it by command: {.code devtools::install_github('welch-lab/RcppPlanc')}") # nocov end + barcodeList <- lapply(object, colnames) + allFeatures <- lapply(object, rownames) + features <- Reduce(.same, allFeatures) + bestResult <- list() bestObj <- Inf bestSeed <- seed @@ -393,7 +400,7 @@ runINMF.Seurat <- function( set.seed(seed = seed + i - 1) out <- RcppPlanc::inmf(objectList = object, k = k, lambda = lambda, niter = nIteration, Hinit = HInit, - Vinit = VInit, Winit = WInit, + Vinit = VInit, Winit = WInit, nCores = nCores, verbose = verbose) if (out$objErr < bestObj) { bestResult <- out @@ -404,8 +411,7 @@ runINMF.Seurat <- function( if (isTRUE(verbose) && nRandomStarts > 1) { cli::cli_alert_success("Best objective error: {bestObj}; Best seed: {bestSeed}") } - barcodeList <- lapply(object, colnames) - features <- rownames(object[[1]]) + factorNames <- paste0("Factor_", seq(k)) for (i in seq_along(object)) { bestResult$H[[i]] <- t(bestResult$H[[i]]) @@ -592,8 +598,10 @@ optimizeALS <- function( # nocov start #' @param minibatchSize Total number of cells in each minibatch. See detail. #' Default \code{5000}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other S3 methods of this function. #' @return #' \itemize{ @@ -669,7 +677,8 @@ runOnlineINMF.liger <- function( AInit = NULL, BInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) { .checkObjVersion(object) @@ -749,7 +758,7 @@ runOnlineINMF.liger <- function( minibatchSize = minibatchSize, HALSiter = HALSiter, verbose = verbose, WInit = WInit, VInit = VInit, AInit = AInit, - BInit = BInit, seed = seed) + BInit = BInit, seed = seed, nCores = nCores) if (!isTRUE(projection)) { # Scenario 1&2, everything updated for (i in seq_along(object)) { @@ -798,7 +807,8 @@ runOnlineINMF.liger <- function( HALSiter = 1, minibatchSize = 5000, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start @@ -809,7 +819,9 @@ runOnlineINMF.liger <- function( nDatasets <- length(object) + length(newDatasets) barcodeList <- c(lapply(object, colnames), lapply(newDatasets, colnames)) names(barcodeList) <- c(names(object), names(newDatasets)) - features <- rownames(object[[1]]) + allFeatures <- c(lapply(object, rownames), lapply(newDatasets, rownames)) + features <- Reduce(.same, allFeatures) + if (!is.null(seed)) set.seed(seed) res <- RcppPlanc::onlineINMF(objectList = object, newDatasets = newDatasets, @@ -818,7 +830,7 @@ runOnlineINMF.liger <- function( minibatchSize = minibatchSize, maxHALSIter = HALSiter, Vinit = VInit, Winit = WInit, Ainit = AInit, Binit = BInit, - verbose = verbose) + nCores = nCores, verbose = verbose) factorNames <- paste0("Factor_", seq(k)) if (isTRUE(projection)) { # Scenario 3 only got H for new datasets @@ -868,7 +880,8 @@ runOnlineINMF.Seurat <- function( HALSiter = 1, minibatchSize = 5000, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) { assay <- assay %||% SeuratObject::DefaultAssay(object) @@ -895,6 +908,7 @@ runOnlineINMF.Seurat <- function( newDatasets = NULL, projection = FALSE, maxEpochs = maxEpochs, HALSiter = HALSiter, minibatchSize = minibatchSize, seed = seed, verbose = verbose, + nCores = nCores, WInit = NULL, VInit = NULL, AInit = NULL, BInit = NULL, ) Hconcat <- t(Reduce(cbind, res$H)) @@ -1061,8 +1075,10 @@ online_iNMF <- function( # nocov start #' the random seed by 1 for each consecutive restart, so future factorization #' of the same dataset can be run with one rep if necessary. Default \code{1}. #' @param seed Random seed to allow reproducible results. Default \code{1}. +#' @param nCores The number of parallel tasks to speed up the computation. +#' Default \code{2L}. Only supported for platform with OpenMP support. #' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other methods and wrapped functions. #' @export #' @references April R. Kriebel and Joshua D. Welch, UINMF performs mosaic @@ -1102,10 +1118,6 @@ runUINMF <- function( object, k = 20, lambda = 5, - nIteration = 30, - nRandomStarts = 1, - seed = 1, - verbose = getOption("ligerVerbose"), ... ) { UseMethod("runUINMF", object) @@ -1121,7 +1133,8 @@ runUINMF.liger <- function( nIteration = 30, nRandomStarts = 1, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) { .checkObjVersion(object) @@ -1143,7 +1156,7 @@ runUINMF.liger <- function( } res <- .runUINMF.list(Elist, Ulist, k = k, lambda = lambda, nIteration = nIteration, - nRandomStarts = nRandomStarts, + nRandomStarts = nRandomStarts, nCores = nCores, seed = seed, verbose = verbose, ...) for (d in names(object)) { ld <- dataset(object, d) @@ -1169,13 +1182,18 @@ runUINMF.liger <- function( nIteration = 30, nRandomStarts = 1, seed = 1, - verbose = getOption("ligerVerbose") + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE) ) { if (!requireNamespace("RcppPlanc", quietly = TRUE)) # nocov start cli::cli_abort( "Package {.pkg RcppPlanc} is required for mosaic iNMF integration with unshared features. Please install it by command: {.code devtools::install_github('welch-lab/RcppPlanc')}")# nocov end + barcodeList <- lapply(object, colnames) + allFeatures <- lapply(object, rownames) + features <- Reduce(.same, allFeatures) + bestObj <- Inf bestRes <- NULL bestSeed <- NULL @@ -1184,7 +1202,8 @@ runUINMF.liger <- function( seed <- seed + i - 1 set.seed(seed) res <- RcppPlanc::uinmf(object, unsharedList, k = k, lambda = lambda, - niter = nIteration, verbose = verbose) + niter = nIteration, nCores = nCores, + verbose = verbose) if (res$objErr < bestObj) { bestRes <- res bestObj <- res$objErr @@ -1195,13 +1214,11 @@ runUINMF.liger <- function( cli::cli_alert_success("Best objective error: {bestObj}; Best seed: {bestSeed}") } rm(res) - features <- rownames(object[[1]]) unsharedFeatures <- lapply(unsharedList, rownames) factorNames <- paste0("Factor_", seq(k)) - barcodes <- lapply(object, colnames) for (d in names(object)) { bestRes$H[[d]] <- t(bestRes$H[[d]]) - dimnames(bestRes$H[[d]]) <- list(factorNames, barcodes[[d]]) + dimnames(bestRes$H[[d]]) <- list(factorNames, barcodeList[[d]]) dimnames(bestRes$V[[d]]) <- list(features, factorNames) if (d %in% names(bestRes$U)) { dimnames(bestRes$U[[d]]) <- list(unsharedFeatures[[d]], factorNames) @@ -1259,7 +1276,7 @@ runUINMF.liger <- function( #' Default \code{"quantileNorm_cluster"} #' @param seed Random seed to allow reproducible results. Default \code{1}. #' @param verbose Logical. Whether to show information of the progress. Default -#' \code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set. +#' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Arguments passed to other S3 methods of this function. #' @return Updated input object #' \itemize{ @@ -1306,7 +1323,7 @@ quantileNorm.liger <- function( refineKNN = TRUE, clusterName = "quantileNorm_cluster", seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) { .checkObjVersion(object) @@ -1354,7 +1371,7 @@ quantileNorm.Seurat <- function( refineKNN = TRUE, clusterName = "quantileNorm_cluster", seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) { resName <- paste0(reduction, "Norm") @@ -1402,7 +1419,7 @@ quantileNorm.Seurat <- function( eps = 0.9, refineKNN = TRUE, seed = 1, - verbose = getOption("ligerVerbose") + verbose = getOption("ligerVerbose", TRUE) ) { set.seed(seed) if (is.character(reference)) { @@ -1543,7 +1560,7 @@ quantile_norm <- function( # nocov start refine.knn = TRUE, clusterName = "H.norm_cluster", rand.seed = 1, - verbose = getOption("ligerVerbose") + verbose = getOption("ligerVerbose", TRUE) ) { lifecycle::deprecate_warn("1.99.0", "quantile_norm()", "quantileNorm()") quantileNorm( @@ -1561,3 +1578,12 @@ quantile_norm <- function( # nocov start verbose = verbose ) } # nocov end + + + + + +.same <- function(x, y) { + if (identical(x, y)) return(x) + else cli::cli_abort("Different features are used for each dataset.") +} diff --git a/man/quantileNorm.Rd b/man/quantileNorm.Rd index b779cd46..2837dd69 100644 --- a/man/quantileNorm.Rd +++ b/man/quantileNorm.Rd @@ -21,7 +21,7 @@ quantileNorm(object, ...) refineKNN = TRUE, clusterName = "quantileNorm_cluster", seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) @@ -39,7 +39,7 @@ quantileNorm(object, ...) refineKNN = TRUE, clusterName = "quantileNorm_cluster", seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) } @@ -87,7 +87,7 @@ Default \code{"quantileNorm_cluster"}} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} \item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} \item{reduction}{Name of the reduction where LIGER integration result is stored. Default \code{"inmf"}.} diff --git a/man/rliger2-deprecated.Rd b/man/rliger2-deprecated.Rd index fe797053..a8ae7a2e 100644 --- a/man/rliger2-deprecated.Rd +++ b/man/rliger2-deprecated.Rd @@ -79,7 +79,7 @@ quantile_norm( refine.knn = TRUE, clusterName = "H.norm_cluster", rand.seed = 1, - verbose = getOption("ligerVerbose") + verbose = getOption("ligerVerbose", TRUE) ) } \description{ diff --git a/man/runCINMF.Rd b/man/runCINMF.Rd new file mode 100644 index 00000000..8142b618 --- /dev/null +++ b/man/runCINMF.Rd @@ -0,0 +1,156 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cINMF.R +\name{runCINMF} +\alias{runCINMF} +\alias{runCINMF.liger} +\alias{runCINMF.Seurat} +\title{Perform consensus iNMF on scaled datasets} +\usage{ +runCINMF(object, k = 20, lambda = 5, rho = 0.3, ...) + +\method{runCINMF}{liger}( + object, + k = 20, + lambda = 5, + rho = 0.3, + nIteration = 30, + nRandomStarts = 10, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), + ... +) + +\method{runCINMF}{Seurat}( + object, + k = 20, + lambda = 5, + rho = 0.3, + datasetVar = "orig.ident", + layer = "ligerScaleData", + assay = NULL, + reduction = "cinmf", + nIteration = 30, + nRandomStarts = 10, + HInit = NULL, + WInit = NULL, + VInit = NULL, + seed = 1, + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), + ... +) +} +\arguments{ +\item{object}{A \linkS4class{liger} object or a Seurat object with +non-negative scaled data of variable features (Done with +\code{\link{scaleNotCenter}}).} + +\item{k}{Inner dimension of factorization (number of factors). Generally, a +higher \code{k} will be needed for datasets with more sub-structure. Default +\code{20}.} + +\item{lambda}{Regularization parameter. Larger values penalize +dataset-specific effects more strongly (i.e. alignment should increase as +\code{lambda} increases). Default \code{5}.} + +\item{rho}{Numeric number between 0 and 1. Fraction for determining the +number of nearest neighbors to look at for consensus (by +\code{rho * nRandomStarts}). Default \code{0.3}.} + +\item{...}{Arguments passed to methods.} + +\item{nIteration}{Total number of block coordinate descent iterations to +perform. Default \code{30}.} + +\item{nRandomStarts}{Number of replicate runs for creating the pool of +factorization results. Default \code{10}.} + +\item{HInit}{Initial values to use for \eqn{H} matrices. A list object where +each element is the initial \eqn{H} matrix of each dataset. Default +\code{NULL}.} + +\item{WInit}{Initial values to use for \eqn{W} matrix. A matrix object. +Default \code{NULL}.} + +\item{VInit}{Initial values to use for \eqn{V} matrices. A list object where +each element is the initial \eqn{V} matrix of each dataset. Default +\code{NULL}.} + +\item{seed}{Random seed to allow reproducible results. Default \code{1}.} + +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + +\item{verbose}{Logical. Whether to show information of the progress. Default +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} + +\item{datasetVar}{Metadata variable name that stores the dataset source +annotation. Default \code{"orig.ident"}.} + +\item{layer}{For Seurat>=4.9.9, the name of layer to retrieve input +non-negative scaled data. Default \code{"ligerScaleData"}. For older Seurat, +always retrieve from \code{scale.data} slot.} + +\item{assay}{Name of assay to use. Default \code{NULL} uses current active +assay.} + +\item{reduction}{Name of the reduction to store result. Also used as the +feature key. Default \code{"cinmf"}.} +} +\value{ +\itemize{ + \item{liger method - Returns updated input \linkS4class{liger} object + \itemize{ + \item{A list of all \eqn{H} matrices can be accessed with + \code{getMatrix(object, "H")}} + \item{A list of all \eqn{V} matrices can be accessed with + \code{getMatrix(object, "V")}} + \item{The \eqn{W} matrix can be accessed with + \code{getMatrix(object, "W")}} + }} + \item{Seurat method - Returns updated input Seurat object + \itemize{ + \item{\eqn{H} matrices for all datasets will be concatenated and + transposed (all cells by k), and form a DimReduc object in the + \code{reductions} slot named by argument \code{reduction}.} + \item{\eqn{W} matrix will be presented as \code{feature.loadings} in the + same DimReduc object.} + \item{\eqn{V} matrices, an objective error value and the dataset + variable used for the factorization is currently stored in + \code{misc} slot of the same DimReduc object.} + }} +} +} +\description{ +Performs consensus integrative non-negative matrix factorization (c-iNMF) +to return factorized \eqn{H}, \eqn{W}, and \eqn{V} matrices. We run the +regular iNMF multiple times with different random starts, and then take the +consensus of frequently appearing factors from gene loading matrices, \eqn{W} +and \eqn{V}. The cell factor loading \eqn{H} matrices are eventually solved +with the consensus \eqn{W} and \eqn{V} matrices. + +Please see \code{\link{runINMF}} for detailed introduction to the regular +iNMF algorithm which is run multiple times in this function. + +The consensus iNMF algorithm is developed basing on the consensus NMF (cNMF) +method (D. Kotliar et al., 2019). +} +\examples{ +pbmc <- normalize(pbmc) +pbmc <- selectGenes(pbmc) +pbmc <- scaleNotCenter(pbmc) +if (requireNamespace("RcppPlanc", quietly = TRUE)) { + pbmc <- runCINMF(pbmc) +} +} +\references{ +Joshua D. Welch and et al., Single-Cell Multi-omic Integration Compares and +Contrasts Features of Brain Cell Identity, Cell, 2019 + +Dylan Kotliar and et al., Identifying gene expression programs of cell-type +identity and cellular activity with single-cell RNA-Seq, eLife, 2019 +} diff --git a/man/runINMF.Rd b/man/runINMF.Rd index 853c2645..78dcea44 100644 --- a/man/runINMF.Rd +++ b/man/runINMF.Rd @@ -18,7 +18,8 @@ runINMF(object, k = 20, lambda = 5, ...) WInit = NULL, VInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) @@ -36,7 +37,8 @@ runINMF(object, k = 20, lambda = 5, ...) WInit = NULL, VInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) } @@ -77,8 +79,11 @@ each element is the initial \eqn{V} matrix of each dataset. Default \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} \item{datasetVar}{Metadata variable name that stores the dataset source annotation. Default \code{"orig.ident"}.} diff --git a/man/runIntegration.Rd b/man/runIntegration.Rd index 67ec1a77..05648ab0 100644 --- a/man/runIntegration.Rd +++ b/man/runIntegration.Rd @@ -20,7 +20,7 @@ runIntegration( lambda = 5, method = c("iNMF", "onlineINMF", "UINMF"), seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) @@ -33,7 +33,7 @@ runIntegration( useLayer = "ligerScaleData", assay = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + verbose = getOption("ligerVerbose", TRUE), ... ) } @@ -58,7 +58,7 @@ dataset-specific effects more strongly (i.e. alignment should increase as \item{seed}{Random seed to allow reproducible results. Default \code{1}.} \item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} \item{datasetVar}{Metadata variable name that stores the dataset source annotation. Default \code{"orig.ident"}.} diff --git a/man/runOnlineINMF.Rd b/man/runOnlineINMF.Rd index 2647b3a4..2751a942 100644 --- a/man/runOnlineINMF.Rd +++ b/man/runOnlineINMF.Rd @@ -22,7 +22,8 @@ runOnlineINMF(object, k = 20, lambda = 5, ...) AInit = NULL, BInit = NULL, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) @@ -38,7 +39,8 @@ runOnlineINMF(object, k = 20, lambda = 5, ...) HALSiter = 1, minibatchSize = 5000, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) } @@ -79,8 +81,11 @@ See detail. Default \code{NULL}.} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} + \item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} \item{datasetVar}{Metadata variable name that stores the dataset source annotation. Default \code{"orig.ident"}.} diff --git a/man/runUINMF.Rd b/man/runUINMF.Rd index a19f6e20..3a13b1c7 100644 --- a/man/runUINMF.Rd +++ b/man/runUINMF.Rd @@ -5,16 +5,7 @@ \alias{runUINMF.liger} \title{Perform Mosaic iNMF (UINMF) on scaled datasets with unshared features} \usage{ -runUINMF( - object, - k = 20, - lambda = 5, - nIteration = 30, - nRandomStarts = 1, - seed = 1, - verbose = getOption("ligerVerbose"), - ... -) +runUINMF(object, k = 20, lambda = 5, ...) \method{runUINMF}{liger}( object, @@ -23,7 +14,8 @@ runUINMF( nIteration = 30, nRandomStarts = 1, seed = 1, - verbose = getOption("ligerVerbose"), + nCores = 2L, + verbose = getOption("ligerVerbose", TRUE), ... ) } @@ -40,6 +32,8 @@ higher \code{k} will be needed for datasets with more sub-structure. Default dataset-specific effects more strongly (i.e. alignment should increase as \code{lambda} increases). Default \code{5}.} +\item{...}{Arguments passed to other methods and wrapped functions.} + \item{nIteration}{Total number of block coordinate descent iterations to perform. Default \code{30}.} @@ -51,10 +45,11 @@ of the same dataset can be run with one rep if necessary. Default \code{1}.} \item{seed}{Random seed to allow reproducible results. Default \code{1}.} -\item{verbose}{Logical. Whether to show information of the progress. Default -\code{getOption("ligerVerbose")} which is \code{TRUE} if users have not set.} +\item{nCores}{The number of parallel tasks to speed up the computation. +Default \code{2L}. Only supported for platform with OpenMP support.} -\item{...}{Arguments passed to other methods and wrapped functions.} +\item{verbose}{Logical. Whether to show information of the progress. Default +\code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.} } \value{ \itemize{ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index d347733b..c179d266 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -31,6 +31,31 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// colNormalize_dense_cpp +arma::mat colNormalize_dense_cpp(arma::mat& x, const arma::uword L); +RcppExport SEXP _rliger2_colNormalize_dense_cpp(SEXP xSEXP, SEXP LSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< arma::mat& >::type x(xSEXP); + Rcpp::traits::input_parameter< const arma::uword >::type L(LSEXP); + rcpp_result_gen = Rcpp::wrap(colNormalize_dense_cpp(x, L)); + return rcpp_result_gen; +END_RCPP +} +// colAggregateMedian_dense_cpp +arma::mat colAggregateMedian_dense_cpp(const arma::mat& x, const arma::uvec& group, const arma::uword n); +RcppExport SEXP _rliger2_colAggregateMedian_dense_cpp(SEXP xSEXP, SEXP groupSEXP, SEXP nSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const arma::mat& >::type x(xSEXP); + Rcpp::traits::input_parameter< const arma::uvec& >::type group(groupSEXP); + Rcpp::traits::input_parameter< const arma::uword >::type n(nSEXP); + rcpp_result_gen = Rcpp::wrap(colAggregateMedian_dense_cpp(x, group, n)); + return rcpp_result_gen; +END_RCPP +} // scaleNotCenter_byRow_rcpp arma::sp_mat scaleNotCenter_byRow_rcpp(arma::sp_mat x); RcppExport SEXP _rliger2_scaleNotCenter_byRow_rcpp(SEXP xSEXP) { @@ -297,6 +322,8 @@ END_RCPP static const R_CallMethodDef CallEntries[] = { {"_rliger2_RunModularityClusteringCpp", (DL_FUNC) &_rliger2_RunModularityClusteringCpp, 9}, + {"_rliger2_colNormalize_dense_cpp", (DL_FUNC) &_rliger2_colNormalize_dense_cpp, 2}, + {"_rliger2_colAggregateMedian_dense_cpp", (DL_FUNC) &_rliger2_colAggregateMedian_dense_cpp, 3}, {"_rliger2_scaleNotCenter_byRow_rcpp", (DL_FUNC) &_rliger2_scaleNotCenter_byRow_rcpp, 1}, {"_rliger2_scaleNotCenter_byRow_perDataset_rcpp", (DL_FUNC) &_rliger2_scaleNotCenter_byRow_perDataset_rcpp, 3}, {"_rliger2_rowVars_sparse_rcpp", (DL_FUNC) &_rliger2_rowVars_sparse_rcpp, 2}, diff --git a/src/cinmf_util.cpp b/src/cinmf_util.cpp new file mode 100644 index 00000000..d11b73ee --- /dev/null +++ b/src/cinmf_util.cpp @@ -0,0 +1,34 @@ +#include +// [[Rcpp::depends(RcppArmadillo)]] + +using namespace Rcpp; +using namespace arma; + +// [[Rcpp::export()]] +arma::mat colNormalize_dense_cpp(arma::mat& x, const arma::uword L) { + arma::mat result(x); + for (int j = 0; j < x.n_cols; ++j) { + double norm = arma::norm(x.col(j), L); + for (int i = 0; i < x.n_rows; ++i) { + result(i, j) /= norm; + } + } + return result; +} + +// x: n features by n selected factors +// group: n-selected-factor integer vector, pre-transformed to 0-base, +// from upstream kmeans clustering. +// [[Rcpp::export()]] +arma::mat colAggregateMedian_dense_cpp(const arma::mat& x, const arma::uvec& group, const arma::uword n) { + arma::mat result(x.n_rows, n); + for (int i = 0; i < n; ++i) { + arma::uvec idx = arma::find(group == i); + arma::mat sub_x = x.cols(idx); + arma::vec median = arma::median(sub_x, 1); + result.col(i) = median; + } + return result; +} + + diff --git a/vignettes/articles/STARmap_dropviz_vig.Rmd b/vignettes/articles/STARmap_dropviz_vig.Rmd index dfeab06d..dd2bd241 100644 --- a/vignettes/articles/STARmap_dropviz_vig.Rmd +++ b/vignettes/articles/STARmap_dropviz_vig.Rmd @@ -8,7 +8,6 @@ output: html_document ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE) library(rgl) -library(rglwidget) options(rgl.useNULL = TRUE) knitr::knit_hooks$set(webgl = hook_webgl) ``` @@ -67,7 +66,7 @@ lig <- scaleNotCenter(lig) ### 5. Mosaic integrative NMF -Unshared Integrative Non-negative Matrix Factorization (UINMF) can be applied with `runIntegration(..., method = "UINMF")`. A standalone function `runUINMF()` is also provided with more detailed documentation and initialization setup. This step produces factor gene loading matrices of the shared genes: $W$ for shared information across datasets, and $V$ for dataset specific information. Specific to UINMF method, additional factor gene loading matrix of unshared genes, $U$ is also produced. $H$ matrices, the cell factor loading matrices are produced for each dataset and can be interpreted as load rank representation of the cells. +Unshared Integrative Non-negative Matrix Factorization (UINMF) can be applied with `runIntegration(..., method = "UINMF")`. A standalone function `runUINMF()` is also provided with more detailed documentation and initialization setup. This step produces factor gene loading matrices of the shared genes: $W$ for shared information across datasets, and $V$ for dataset specific information. Specific to UINMF method, additional factor gene loading matrix of unshared features, $U$ is also produced. $H$ matrices, the cell factor loading matrices are produced for each dataset and can be interpreted as low-rank representation of the cells. In this tutorial, we set dataset specific lambda (regularization parameter) values to penalize the dataset specific effect differently.