diff --git a/src/methods/bbknn/config.vsh.yaml b/src/methods/bbknn/config.vsh.yaml new file mode 100644 index 00000000..cd656ace --- /dev/null +++ b/src/methods/bbknn/config.vsh.yaml @@ -0,0 +1,52 @@ +__merge__: /src/api/comp_method.yaml +name: bbknn +label: BBKNN +summary: BBKNN creates a k nearest neighbours graph by identifying neighbours within + batches, then combining and processing them with UMAP for visualisation. +description: | + "BBKNN or batch balanced k nearest neighbours graph is built for each cell by + identifying its k nearest neighbours within each defined batch separately, + creating independent neighbour sets for each cell in each batch. These sets + are then combined and processed with the UMAP algorithm for visualisation." +references: + doi: 10.1093/bioinformatics/btz625 +links: + repository: https://github.com/Teichlab/bbknn + documentation: https://github.com/Teichlab/bbknn#readme +info: + method_types: [graph] + preferred_normalization: log_cp10k + variants: + bbknn_full_unscaled: + bbknn_full_scaled: + preferred_normalization: log_cp10k_scaled +arguments: + - name: --annoy_n_trees + type: integer + default: 10 + description: Number of trees to use in the annoy forest. + - name: --neighbors_within_batch + type: integer + default: 3 + description: Number of neighbors to report within each batch. + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. +resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - bbknn +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/bbknn/script.py b/src/methods/bbknn/script.py new file mode 100644 index 00000000..078d40ce --- /dev/null +++ b/src/methods/bbknn/script.py @@ -0,0 +1,63 @@ +import sys +import anndata as ad +import scanpy as sc +import bbknn + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'annoy_n_trees': 10, + 'neighbors_within_batch': 3, + 'n_hvg': 2000, +} +meta = { + 'name': 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} highly variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + sc.pp.pca(adata) + +print('Run BBKNN', flush=True) +kwargs = dict(batch_key='batch', copy=True) +kwargs['annoy_n_trees'] = par['annoy_n_trees'] +kwargs['neighbors_within_batch'] = par['neighbors_within_batch'] + +ad_bbknn = bbknn.bbknn(adata, **kwargs) + +print("Store output", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsp={ + 'connectivities': ad_bbknn.obsp['connectivities'], + 'distances': ad_bbknn.obsp['distances'], + }, + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + 'neighbors': ad_bbknn.uns['neighbors'] + } +) + +print("Store outputs", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/combat/config.vsh.yaml b/src/methods/combat/config.vsh.yaml new file mode
100644 index 00000000..c5763a9d --- /dev/null +++ b/src/methods/combat/config.vsh.yaml @@ -0,0 +1,42 @@ +__merge__: /src/api/comp_method.yaml +name: combat +label: Combat +summary: Adjusting batch effects in microarray expression data using empirical Bayes + methods +description: | + "An Empirical Bayes (EB) approach to correct for batch effects. It + estimates batch-specific parameters by pooling information across genes in + each batch and shrinks the estimates towards the overall mean of the batch + effect estimates across all genes. These parameters are then used to adjust + the data for batch effects, leading to more accurate and reproducible + results." +references: + doi: 10.1093/biostatistics/kxj037 +links: + repository: https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html + documentation: https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html +info: + method_types: [feature] + preferred_normalization: log_cp10k + variants: + combat_full_unscaled: + combat_full_scaled: + preferred_normalization: log_cp10k_scaled +arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. +resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, highmem, lowcpu] diff --git a/src/methods/combat/script.py b/src/methods/combat/script.py new file mode 100644 index 00000000..3d90f7bf --- /dev/null +++ b/src/methods/combat/script.py @@ -0,0 +1,56 @@ +import sys +import scanpy as sc +from scipy.sparse import csr_matrix + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} + +meta = { + 'name': 'foo', + 'config': 'bar' +} + +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + + +print('Run Combat', flush=True) +adata.X = sc.pp.combat(adata, key='batch', inplace=False) + + +print("Store output", flush=True) +output = sc.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + }, + layers={ + 'corrected_counts': csr_matrix(adata.X), + } +) + +print("Store outputs", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/fastmnn/config.vsh.yaml b/src/methods/fastmnn/config.vsh.yaml new file mode 100644 index 00000000..f65d97b3 --- /dev/null +++ b/src/methods/fastmnn/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: /src/api/comp_method.yaml +name: fastmnn +label: fastMnn +summary: A simpler version of the original mnnCorrect algorithm. +description: | + The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps. + + 1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality. + 2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch. + 3. 
Remove variation along the average batch vector in both reference and target batches. + 4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors. + 5. Merge the corrected target batch with the reference, and repeat with the next target batch. +references: + doi: 10.1038/nbt.4091 +links: + repository: https://code.bioconductor.org/browse/batchelor/ + documentation: https://bioconductor.org/packages/batchelor/ +info: + method_types: [feature, embedding] + preferred_normalization: log_cp10k +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: batchelor +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowcpu, highmem] diff --git a/src/methods/fastmnn/script.R b/src/methods/fastmnn/script.R new file mode 100644 index 00000000..14d7bce2 --- /dev/null +++ b/src/methods/fastmnn/script.R @@ -0,0 +1,50 @@ +cat("Loading dependencies\n") +suppressPackageStartupMessages({ + requireNamespace("anndata", quietly = TRUE) + library(Matrix, warn.conflicts = FALSE) + requireNamespace("batchelor", quietly = TRUE) + library(SingleCellExperiment, warn.conflicts = FALSE) +}) + +## VIASH START +par <- list( + input = 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + output = 'output.h5ad' +) +meta <- list( + name = "mnn_correct_feature" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +# TODO: pass output of 'multiBatchNorm' to fastMNN + +cat("Run mnn\n") +out <- suppressWarnings(batchelor::fastMNN( + t(adata$layers[["normalized"]]), + batch = adata$obs[["batch"]] +)) + +layer <- as(SummarizedExperiment::assay(out, "reconstructed"), "sparseMatrix") +obsm <- SingleCellExperiment::reducedDim(out, "corrected") + +cat("Reformat output\n") +output <- anndata::AnnData( + layers = list( + corrected_counts = t(layer) + ), + obsm = list( + X_emb = obsm + ), + shape = adata$shape, + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ) +) + +cat("Write output to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/methods/liger/config.vsh.yaml b/src/methods/liger/config.vsh.yaml new file mode 100644 index 00000000..ada022b2 --- /dev/null +++ b/src/methods/liger/config.vsh.yaml @@ -0,0 +1,34 @@ +__merge__: /src/api/comp_method.yaml +name: liger +label: LIGER +summary: Linked Inference of Genomic Experimental Relationships +description: | + LIGER or linked inference of genomic experimental relationships uses iNMF + deriving and implementing a novel coordinate descent algorithm to efficiently + do the factorization. Joint clustering is performed and factor loadings are + normalised. 
+references: + doi: 10.1016/j.cell.2019.05.006 +links: + repository: https://github.com/welch-lab/liger + documentation: https://github.com/welch-lab/liger +info: + method_types: [embedding] + preferred_normalization: log_cp10k +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: apt + packages: cmake + - type: r + cran: rliger + github: welch-lab/RcppPlanc +runners: + - type: executable + - type: nextflow + directives: + label: [lowcpu, highmem, midtime] diff --git a/src/methods/liger/script.R b/src/methods/liger/script.R new file mode 100644 index 00000000..e6460c50 --- /dev/null +++ b/src/methods/liger/script.R @@ -0,0 +1,108 @@ +cat(">> Load dependencies\n") +requireNamespace("anndata", quietly = TRUE) +requireNamespace("rliger", quietly = TRUE) + +## VIASH START +par <- list( + input = "resources_test/batch_integration/pancreas/dataset.h5ad", + output = "output.h5ad" +) +meta <- list( + name = "liger" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +anndataToLiger <- function(adata) { + # fetch batch names + batch <- adata$obs$batch + batch_names <- as.character(unique(batch)) + + # restructure data + raw_data <- lapply(batch_names, function(batch_name) { + Matrix::t(adata$layers[["counts"]][batch == batch_name, , drop = FALSE]) + }) + names(raw_data) <- batch_names + + rliger::createLiger(rawData = raw_data, removeMissing = FALSE) +} + +addNormalizedDataToLiger <- function(adata, lobj) { + norm_data <- lapply(names(rliger::rawData(lobj)), function(name) { + norm <- adata$layers[["normalized"]] + + # subset + col_names <- colnames(rliger::rawData(lobj)[[name]]) + row_names <- rownames(rliger::rawData(lobj)[[name]]) + prefix <- paste0(name, "_") + col_names <- sub(prefix, "", col_names) + + norm <- norm[ + col_names, + row_names, + drop = FALSE + ] + + # add prefix + rownames(norm) <- paste0(prefix, rownames(norm)) + + # transpose + norm <- Matrix::t(norm) + + # turn into dgcMatrix + as(as(norm, "denseMatrix"), "CsparseMatrix") + }) + names(norm_data) <- names(rliger::rawData(lobj)) + + for (name in names(rliger::rawData(lobj))) { + lobj@datasets[[name]]@normData <- norm_data[[name]] + } + + lobj +} + +cat(">> Create Liger Data object\n") +lobj <- anndataToLiger(adata) + +cat(">> Normalize data\n") +lobj <- addNormalizedDataToLiger(adata, lobj) + +# could also use the rliger normalization instead +# lobj <- rliger::normalize(lobj) + +cat(">> Select genes\n") +# lobj <- rliger::selectGenes(lobj) +# overwrite gene selection to include all genes +lobj@varFeatures <- adata$var_names + +cat(">> Perform scaling\n") +lobj <- rliger::scaleNotCenter(lobj, removeMissing = FALSE) + +cat(">> Joint Matrix Factorization\n") +lobj <- rliger::runIntegration(lobj, k = 20) + +cat(">> Quantile normalization\n") +lobj <- rliger::quantileNorm(lobj) + +cat(">> Store output\n") +# remove dataset names from rownames +for (name in names(rliger::rawData(lobj))) { + rownames(lobj@H.norm) <- sub(paste0(name, "_"), "", rownames(lobj@H.norm)) +} + +output <- anndata::AnnData( + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ), + obsm = list( + X_emb = lobj@H.norm[rownames(adata), , drop = FALSE] + ), + shape = adata$shape +) + +cat(">> Write AnnData to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/methods/logistic_regression/config.vsh.yaml 
b/src/methods/logistic_regression/config.vsh.yaml deleted file mode 100644 index 479aa3aa..00000000 --- a/src/methods/logistic_regression/config.vsh.yaml +++ /dev/null @@ -1,79 +0,0 @@ -# The API specifies which type of component this is. -# It contains specifications for: -# - The input/output files -# - Common parameters -# - A unit test -__merge__: ../../api/comp_method.yaml - - -# A unique identifier for your component (required). -# Can contain only lowercase letters or underscores. -name: logistic_regression -# A relatively short label, used when rendering visualisations (required) -label: Logistic Regression -# A one sentence summary of how this method works (required). Used when -# rendering summary tables. -summary: "Logistic Regression with 100-dimensional PCA coordinates estimates parameters for multivariate classification by minimizing cross entropy loss over cell type classes." -# A multi-line description of how this component works (required). Used -# when rendering reference documentation. -description: | - Logistic Regression estimates parameters of a logistic function for - multivariate classification tasks. Here, we use 100-dimensional whitened PCA - coordinates as independent variables, and the model minimises the cross - entropy loss over all cell type classes. -# Metadata for your component -# A reference key from the bibtex library at src/common/library.bib (required). -references: - bibtex: - - | - @book{hosmer2013applied, - title = {Applied logistic regression}, - author = {Hosmer Jr, D.W. and Lemeshow, S. and Sturdivant, R.X.}, - year = {2013}, - publisher = {John Wiley \& Sons}, - volume = {398} - } - -links: - # URL to the code repository for this method (required). - repository: https://github.com/scikit-learn/scikit-learn - # URL to the documentation for this method (required). - documentation: "https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html" - -info: - # Which normalisation method this component prefers to use (required). - preferred_normalization: log_cp10k - -# Component-specific parameters (optional) -# arguments: -# - name: "--n_neighbors" -# type: "integer" -# default: 5 -# description: Number of neighbors to use. - -# Resources required to run the component -resources: - # The script of your component (required) - - type: python_script - path: script.py - # Additional resources your script needs (optional) - # - type: file - # path: weights.pt - -engines: - # Specifications for the Docker image for this component. - - type: docker - image: openproblems/base_python:1.0.0 - # Add custom dependencies here (optional). For more information, see - # https://viash.io/reference/config/engines/docker/#setup . - setup: - - type: python - packages: scikit-learn - -runners: - # This platform allows running the component natively - - type: executable - # Allows turning the component into a Nextflow module / pipeline. - - type: nextflow - directives: - label: [midtime, midmem, lowcpu] diff --git a/src/methods/logistic_regression/script.py b/src/methods/logistic_regression/script.py deleted file mode 100644 index cc851f8e..00000000 --- a/src/methods/logistic_regression/script.py +++ /dev/null @@ -1,46 +0,0 @@ -import anndata as ad -import sklearn.linear_model - -## VIASH START -# Note: this section is auto-generated by viash at runtime. To edit it, make changes -# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`. 
-par = { - 'input_train': 'resources_test/task_template/pancreas/train.h5ad', - 'input_test': 'resources_test/task_template/pancreas/test.h5ad', - 'output': 'output.h5ad' -} -meta = { - 'name': 'logistic_regression' -} -## VIASH END - -print('Reading input files', flush=True) -input_train = ad.read_h5ad(par['input_train']) -input_test = ad.read_h5ad(par['input_test']) - -print('Preprocess data', flush=True) -# ... preprocessing ... - -print('Train model', flush=True) -# ... train model ... -classifier = sklearn.linear_model.LogisticRegression() -classifier.fit(input_train.obsm["X_pca"], input_train.obs["label"].astype(str)) - -print('Generate predictions', flush=True) -# ... generate predictions ... -obs_label_pred = classifier.predict(input_test.obsm["X_pca"]) - -print("Write output AnnData to file", flush=True) -output = ad.AnnData( - uns={ - 'dataset_id': input_train.uns['dataset_id'], - 'normalization_id': input_train.uns['normalization_id'], - 'method_id': meta['name'] - }, - obs={ - 'label_pred': obs_label_pred - } -) -output.obs_names = input_test.obs_names - -output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/mnn_correct/config.vsh.yaml b/src/methods/mnn_correct/config.vsh.yaml new file mode 100644 index 00000000..79446a52 --- /dev/null +++ b/src/methods/mnn_correct/config.vsh.yaml @@ -0,0 +1,31 @@ +__merge__: /src/api/comp_method.yaml +name: mnn_correct +label: mnnCorrect +summary: Correct for batch effects in single-cell expression data using the mutual + nearest neighbors method. +description: | + We present a strategy for batch correction based on the detection of mutual nearest neighbors (MNNs) in the high-dimensional expression space. + Our approach does not rely on predefined or equal population compositions across batches; instead, it requires only that a subset of the population be shared between batches. 
+references: + doi: 10.1038/nbt.4091 +links: + repository: https://code.bioconductor.org/browse/batchelor/ + documentation: https://bioconductor.org/packages/batchelor/ +info: + method_types: [feature] + preferred_normalization: log_cp10k +resources: + - type: r_script + path: script.R +engines: + - type: docker + image: openproblems/base_r:1.0.0 + setup: + - type: r + bioc: + - batchelor +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowcpu, highmem] diff --git a/src/methods/mnn_correct/script.R b/src/methods/mnn_correct/script.R new file mode 100644 index 00000000..4f7ea0cf --- /dev/null +++ b/src/methods/mnn_correct/script.R @@ -0,0 +1,44 @@ +cat("Loading dependencies\n") +suppressPackageStartupMessages({ + requireNamespace("anndata", quietly = TRUE) + library(Matrix, warn.conflicts = FALSE) + requireNamespace("batchelor", quietly = TRUE) + library(SingleCellExperiment, warn.conflicts = FALSE) +}) +## VIASH START +par <- list( + input = 'resources_test/batch_integration/pancreas/dataset.h5ad', + output = 'output.h5ad' +) +meta <- list( + name = "mnn_correct_feature" +) +## VIASH END + +cat("Read input\n") +adata <- anndata::read_h5ad(par$input) + +cat("Run mnn\n") +out <- suppressWarnings(batchelor::mnnCorrect( + t(adata$layers[["normalized"]]), + batch = adata$obs[["batch"]] +)) + +cat("Reformat output\n") +layer <- SummarizedExperiment::assay(out, "corrected") + +cat("Store outputs\n") +output <- anndata::AnnData( + uns = list( + dataset_id = adata$uns[["dataset_id"]], + normalization_id = adata$uns[["normalization_id"]], + method_id = meta$name + ), + layers = list( + corrected_counts = as(t(layer), "sparseMatrix") + ), + shape = adata$shape +) + +cat("Write output to file\n") +zzz <- output$write_h5ad(par$output, compression = "gzip") diff --git a/src/methods/mnnpy/config.vsh.yaml b/src/methods/mnnpy/config.vsh.yaml new file mode 100644 index 00000000..759949ce --- /dev/null +++ b/src/methods/mnnpy/config.vsh.yaml @@ -0,0 +1,51 @@ +__merge__: /src/api/comp_method.yaml +name: mnnpy +label: mnnpy +summary: Batch effect correction by matching mutual nearest neighbors, Python implementation. +description: | + An implementation of MNN correct in python featuring low memory usage, full multicore support and compatibility with the scanpy framework. + + Batch effect correction by matching mutual nearest neighbors (Haghverdi et al, 2018) has been implemented as a function 'mnnCorrect' in the R package scran. Sadly it's extremely slow for big datasets and doesn't make full use of the parallel architecture of modern CPUs. + + This project is a python implementation of the MNN correct algorithm which takes advantage of python's extendability and hackability. It seamlessly integrates with the scanpy framework and has multicore support in its bones. +references: + doi: 10.1038/s41587-019-0113-3 +links: + repository: https://github.com/chriscainx/mnnpy + documentation: https://github.com/chriscainx/mnnpy#readme +info: + method_types: [feature] + preferred_normalization: log_cp10k + variants: + mnn_full_unscaled: + mnn_full_scaled: + preferred_normalization: log_cp10k_scaled +arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+resources: + - type: python_script + path: script.py +engines: + - type: docker + image: python:3.8 + setup: + - type: apt + packages: + - procps + - type: python + pypi: + - anndata~=0.8.0 + - scanpy + - pyyaml + - requests + - jsonschema + github: + - chriscainx/mnnpy +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowcpu, lowmem] diff --git a/src/methods/mnnpy/script.py b/src/methods/mnnpy/script.py new file mode 100644 index 00000000..83808829 --- /dev/null +++ b/src/methods/mnnpy/script.py @@ -0,0 +1,55 @@ +import anndata as ad +import mnnpy + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} +meta = { + 'name': 'foo', + 'config': 'bar' +} +## VIASH END + +print('Read input', flush=True) +adata = ad.read_h5ad(par['input']) +adata.X = adata.layers['normalized'] +del adata.layers['normalized'] +del adata.layers['counts'] + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} highly variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + +print('Run mnn', flush=True) +split = [] +batch_categories = adata.obs['batch'].cat.categories +for i in batch_categories: + split.append(adata[adata.obs['batch'] == i].copy()) +corrected, _, _ = mnnpy.mnn_correct( + *split, + batch_key='batch', + batch_categories=batch_categories, + index_unique=None + ) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + }, + layers={ + 'corrected_counts': corrected.X, + } +) + + +print("Store outputs", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/pyliger/config.vsh.yaml b/src/methods/pyliger/config.vsh.yaml new file mode 100644 index 00000000..09bb7d55 --- /dev/null +++ b/src/methods/pyliger/config.vsh.yaml @@ -0,0 +1,41 @@ +__merge__: /src/api/comp_method.yaml +name: pyliger +label: pyliger +summary: Python implementation of LIGER (Linked Inference of Genomic Experimental + Relationships) +description: | + LIGER (installed as pyliger) is a package for integrating and analyzing multiple + single-cell datasets, developed by the Macosko lab and maintained/extended by the + Welch lab. It relies on integrative non-negative matrix factorization to identify + shared and dataset-specific factors.
+references: + doi: 10.1016/j.cell.2019.05.006 +links: + repository: https://github.com/welch-lab/pyliger + documentation: https://github.com/welch-lab/pyliger +info: + method_types: [embedding] + preferred_normalization: log_cp10k + variants: + liger_unscaled: + liger_scaled: + preferred_normalization: log_cp10k_scaled +resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - umap-learn[plot] + - pyliger + - dask-expr +runners: + - type: executable + - type: nextflow + directives: + label: [lowcpu, highmem, midtime] diff --git a/src/methods/pyliger/script.py b/src/methods/pyliger/script.py new file mode 100644 index 00000000..a0abd2ad --- /dev/null +++ b/src/methods/pyliger/script.py @@ -0,0 +1,86 @@ +import sys +import anndata as ad +import numpy as np +import pyliger + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', + 'output': 'output.h5ad' +} +meta = { + 'name': 'pyliger' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('>> Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) +adata.layers['norm_data'] = read_anndata(par['input'], X='layers/normalized').X + +print('>> Prepare data', flush=True) +adata_per_batch = [] +for batch in adata.obs['batch'].unique(): + adb = adata[adata.obs['batch'] == batch].copy() + + # save row sum and sum of squares for further use + norm_sum = np.ravel(np.sum(adb.layers["norm_data"], axis=0)) + norm_sum_sq = np.ravel(np.sum(adb.layers["norm_data"].power(2), axis=0)) + adb.var["norm_sum"] = norm_sum + adb.var["norm_sum_sq"] = norm_sum_sq + adb.var["norm_mean"] = norm_sum / adb.shape[0] + + # set more metadata + adb.obs.index.name = 'cell_barcode' + adb.var.index.name = 'gene_id' + adb.uns['sample_name'] = batch + + # append to list + adata_per_batch.append(adb) + +print('Create liger object', flush=True) +lobj = pyliger.create_liger( + adata_per_batch, + remove_missing=False +) + +# do not select genes +lobj.var_genes = adata.var_names + +print('>> Scaling', flush=True) +pyliger.scale_not_center(lobj, remove_missing=False) + +print('>> Optimize ALS', flush=True) +pyliger.optimize_ALS(lobj, k=20) + +print('>> Quantile normalization', flush=True) +pyliger.quantile_norm(lobj) + +print('>> Concatenate outputs', flush=True) +ad_out = ad.concat(lobj.adata_list) + +print('Store output', flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + 'X_emb': ad_out[adata.obs_names, :].obsm['H_norm'] + }, + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/scalex/config.vsh.yaml b/src/methods/scalex/config.vsh.yaml new file mode 100644 index 00000000..39a0d223 --- /dev/null +++ b/src/methods/scalex/config.vsh.yaml @@ -0,0 +1,43 @@ +__merge__: /src/api/comp_method.yaml +name: scalex +label: SCALEX +summary: Online single-cell data integration through projecting heterogeneous datasets + into a common cell-embedding space +description: | + SCALEX is a method for integrating heterogeneous single-cell data online using a VAE framework. 
Its generalised encoder disentangles batch-related components from batch-invariant biological components, which are then projected into a common cell-embedding space. +references: + doi: 10.1038/s41467-022-33758-z +links: + repository: https://github.com/jsxlei/SCALEX + documentation: https://scalex.readthedocs.io +info: + method_types: [feature, embedding] + preferred_normalization: log_cp10k + variants: + scalex_feature_unscaled: + scalex_feature_scaled: + preferred_normalization: log_cp10k_scaled +arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. +resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scalex + - numpy<1.24 + - torch<2.1 +runners: + - type: executable + - type: nextflow + directives: + label: [lowmem, lowcpu, midtime] diff --git a/src/methods/scalex/script.py b/src/methods/scalex/script.py new file mode 100644 index 00000000..398412ed --- /dev/null +++ b/src/methods/scalex/script.py @@ -0,0 +1,69 @@ +import sys +import anndata as ad +import scalex + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} +meta = { + 'name' : 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} highly variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + +print('Run SCALEX', flush=True) +adata = scalex.SCALEX( + adata, + batch_key="batch", + ignore_umap=True, + impute=adata.obs["batch"].cat.categories[0], + processed=True, + max_iteration=40, + min_features=None, + min_cells=None, + n_top_features=0, + outdir=None, + gpu=0, +) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + layers={ + 'corrected_counts': adata.layers["impute"], + }, + obsm={ + 'X_emb': adata.obsm['latent'], + }, + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + } +) + +print("Write output to file", flush=True) +output.write_h5ad(par['output'], compression='gzip') diff --git a/src/methods/scanorama/config.vsh.yaml b/src/methods/scanorama/config.vsh.yaml new file mode 100644 index 00000000..0e936e9e --- /dev/null +++ b/src/methods/scanorama/config.vsh.yaml @@ -0,0 +1,40 @@ +__merge__: /src/api/comp_method.yaml +name: scanorama +label: Scanorama +summary: Efficient integration of heterogeneous single-cell transcriptomes using Scanorama +description: | + "Scanorama is an extension of the MNN method. Unlike MNN, it finds mutual nearest neighbours across all batches and embeds observations into a joint hyperplane."
+references: + doi: 10.1038/s41587-019-0113-3 +links: + repository: https://github.com/brianhie/scanorama + documentation: https://github.com/brianhie/scanorama#readme +info: + method_types: [feature, embedding] + preferred_normalization: log_cp10k + variants: + scanorama_full_unscaled: + scanorama_full_scaled: + preferred_normalization: log_cp10k_scaled +arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. +resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scanorama +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, lowcpu] diff --git a/src/methods/scanorama/script.py b/src/methods/scanorama/script.py new file mode 100644 index 00000000..8a8901f9 --- /dev/null +++ b/src/methods/scanorama/script.py @@ -0,0 +1,87 @@ +import sys +import anndata as ad +import scanorama + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, +} +meta = { + 'name': 'foo', + 'config': 'bar' +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +# based on scib +# -> https://github.com/theislab/scib/blob/59ae6eee5e611d9d3db067685ec96c28804e9127/scib/utils.py#L51C1-L72C62 +def merge_adata(*adata_list, **kwargs): + """Merge adatas from list while remove duplicated ``obs`` and ``var`` columns + + :param adata_list: ``anndata`` objects to be concatenated + :param kwargs: arguments to be passed to ``anndata.AnnData.concatenate`` + """ + + if len(adata_list) == 1: + return adata_list[0] + + # Make sure that adatas do not contain duplicate columns + for _adata in adata_list: + for attr in ("obs", "var"): + df = getattr(_adata, attr) + dup_mask = df.columns.duplicated() + if dup_mask.any(): + print( + f"Deleting duplicated keys `{list(df.columns[dup_mask].unique())}` from `adata.{attr}`." 
+ ) + setattr(_adata, attr, df.loc[:, ~dup_mask]) + + return ad.AnnData.concatenate(*adata_list, **kwargs) + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/normalized', + obs='obs', + var='var', + uns='uns' +) + +if par['n_hvg']: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']] + adata = adata[:, idx].copy() + +print('Run scanorama', flush=True) +split = [] +batch_categories = adata.obs['batch'].cat.categories +for i in batch_categories: + split.append(adata[adata.obs['batch'] == i].copy()) +corrected = scanorama.correct_scanpy(split, return_dimred=True) +corrected = merge_adata(*corrected, batch_key='batch', batch_categories=batch_categories, index_unique=None) + +print("Store output", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + uns={ + 'dataset_id': adata.uns['dataset_id'], + 'normalization_id': adata.uns['normalization_id'], + 'method_id': meta['name'], + }, + layers={ + 'corrected_counts': corrected.X, + }, + obsm={ + 'X_emb': corrected.obsm["X_scanorama"], + } +) + +print("Write output to file", flush=True) +output.write(par['output'], compression='gzip') diff --git a/src/methods/scanvi/config.vsh.yaml b/src/methods/scanvi/config.vsh.yaml new file mode 100644 index 00000000..dcc7c06d --- /dev/null +++ b/src/methods/scanvi/config.vsh.yaml @@ -0,0 +1,61 @@ +__merge__: /src/api/comp_method.yaml +name: scanvi +label: scANVI +summary: scANVI is a deep learning method that considers cell type labels. +description: | + scANVI (single-cell ANnotation using Variational Inference; Python class SCANVI) is a semi-supervised model for single-cell transcriptomics data. In a sense, it can be seen as a scVI extension that can leverage the cell type knowledge for a subset of the cells present in the data sets to infer the states of the rest of the cells. +references: + doi: 10.1038/s41592-018-0229-2 +links: + repository: https://github.com/scverse/scvi-tools + documentation: https://docs.scvi-tools.org/en/stable/user_guide/models/scanvi.html +info: + method_types: [embedding] + preferred_normalization: counts + variants: + scanvi_full_unscaled: +arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. + - name: --n_latent + type: integer + default: 30 + description: Number of latent dimensions. + - name: --n_hidden + type: integer + default: 128 + description: Number of hidden units. + - name: --n_layers + type: integer + default: 2 + description: Number of layers. + - name: --max_epochs_scvi + type: integer + example: 400 + description: Maximum number of training epochs for scVI. + - name: --max_epochs_scanvi + type: integer + example: 10 + description: Maximum number of training epochs for scANVI. 
+resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, lowmem, lowcpu, gpu] diff --git a/src/methods/scanvi/script.py b/src/methods/scanvi/script.py new file mode 100644 index 00000000..22c2f0f0 --- /dev/null +++ b/src/methods/scanvi/script.py @@ -0,0 +1,76 @@ +import sys +import anndata as ad +from scvi.model import SCVI, SCANVI + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, + 'n_latent': 30, + 'n_hidden': 128, + 'n_layers': 2, + 'max_epochs_scvi': 20, + 'max_epochs_scanvi': 20 +} +meta = { + 'name' : 'scanvi', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + adata = adata[:, idx].copy() + +print("Processing data", flush=True) +SCVI.setup_anndata(adata, batch_key="batch") + +print("Run scVI", flush=True) +model_kwargs = { + key: par[key] + for key in ["n_latent", "n_hidden", "n_layers"] + if par[key] is not None +} + +vae = SCVI(adata, **model_kwargs) + +vae.train(max_epochs=par["max_epochs_scvi"], train_size=1.0) + +print('Run SCANVI', flush=True) +scanvae = SCANVI.from_scvi_model( + scvi_model=vae, + labels_key="label", + unlabeled_category="UnknownUnknown", # pick anything definitely not in a dataset +) +scanvae.train(max_epochs=par["max_epochs_scanvi"], train_size=1.0) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": scanvae.get_latent_representation(), + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/methods/scvi/config.vsh.yaml b/src/methods/scvi/config.vsh.yaml new file mode 100644 index 00000000..4f833141 --- /dev/null +++ b/src/methods/scvi/config.vsh.yaml @@ -0,0 +1,57 @@ +__merge__: /src/api/comp_method.yaml +name: scvi +label: scVI +summary: scVI combines a variational autoencoder with a hierarchical Bayesian model. +description: | + scVI combines a variational autoencoder with a hierarchical Bayesian model. It uses the negative binomial distribution to describe gene expression of each cell, conditioned on unobserved factors and the batch variable. ScVI is run as implemented in Luecken et al. +references: + doi: 10.1038/s41592-018-0229-2 +links: + repository: https://github.com/scverse/scvi-tools + documentation: https://docs.scvi-tools.org/en/stable/user_guide/models/scvi.html +info: + method_types: [embedding] + preferred_normalization: counts + variants: + scvi_full_unscaled: +arguments: + - name: --n_hvg + type: integer + default: 2000 + description: Number of highly variable genes to use. 
+ - name: --n_latent + type: integer + default: 30 + description: Number of latent dimensions. + - name: --n_hidden + type: integer + default: 128 + description: Number of hidden units. + - name: --n_layers + type: integer + default: 2 + description: Number of layers. + - name: --max_epochs + type: integer + example: 400 + description: Maximum number of epochs. +resources: + - type: python_script + path: script.py + - type: python_script + path: /src/utils/read_anndata_partial.py +engines: + - type: docker + image: openproblems/base_python:1.0.0 + setup: + - type: python + pypi: + - scvi-tools>=1.1.0 + - type: docker + run: | + pip install -U "jax[cuda12_pip]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +runners: + - type: executable + - type: nextflow + directives: + label: [midtime, midmem, lowcpu, gpu] diff --git a/src/methods/scvi/script.py b/src/methods/scvi/script.py new file mode 100644 index 00000000..d93d9ebb --- /dev/null +++ b/src/methods/scvi/script.py @@ -0,0 +1,66 @@ +import sys +import anndata as ad +from scvi.model import SCVI + +## VIASH START +par = { + 'input': 'resources_test/batch_integration/pancreas/dataset.h5ad', + 'output': 'output.h5ad', + 'n_hvg': 2000, + 'n_latent': 30, + 'n_hidden': 128, + 'n_layers': 2, + 'max_epochs': 400 +} +meta = { + 'name' : 'scvi', +} +## VIASH END + +sys.path.append(meta["resources_dir"]) +from read_anndata_partial import read_anndata + +print('Read input', flush=True) +adata = read_anndata( + par['input'], + X='layers/counts', + obs='obs', + var='var', + uns='uns' +) + +if par["n_hvg"]: + print(f"Select top {par['n_hvg']} high variable genes", flush=True) + idx = adata.var["hvg_score"].to_numpy().argsort()[::-1][:par["n_hvg"]] + adata = adata[:, idx].copy() + +print("Processing data", flush=True) +SCVI.setup_anndata(adata, batch_key="batch") + +print("Run scVI", flush=True) +model_kwargs = { + key: par[key] + for key in ["n_latent", "n_hidden", "n_layers"] + if par[key] is not None +} + +vae = SCVI(adata, **model_kwargs) + +vae.train(max_epochs=par["max_epochs"], train_size=1.0) + +print("Store outputs", flush=True) +output = ad.AnnData( + obs=adata.obs[[]], + var=adata.var[[]], + obsm={ + "X_emb": vae.get_latent_representation(), + }, + uns={ + "dataset_id": adata.uns["dataset_id"], + "normalization_id": adata.uns["normalization_id"], + "method_id": meta["name"], + }, +) + +print("Write output to file", flush=True) +output.write_h5ad(par["output"], compression="gzip") diff --git a/src/utils/read_anndata_partial.py b/src/utils/read_anndata_partial.py new file mode 100644 index 00000000..efbea059 --- /dev/null +++ b/src/utils/read_anndata_partial.py @@ -0,0 +1,77 @@ +import warnings +from pathlib import Path +import anndata as ad +import h5py +from scipy.sparse import csr_matrix +from anndata.experimental import read_elem, sparse_dataset + + +def read_anndata( + file: str, + backed: bool = False, + **kwargs +) -> ad.AnnData: + """ + Read anndata file + :param file: path to anndata file in h5ad format + :param kwargs: AnnData parameter to group mapping + """ + assert Path(file).exists(), f'File not found: {file}' + + f = h5py.File(file, 'r') + kwargs = {x: x for x in f} if not kwargs else kwargs + if len(f.keys()) == 0: + return ad.AnnData() + # check if keys are available + for name, slot in kwargs.items(): + if slot not in f: + warnings.warn( + f'Cannot find "{slot}" for AnnData parameter `{name}` from "{file}"' + ) + adata = read_partial(f, backed=backed, **kwargs) + if not backed: + f.close() + + return 
adata + + +def read_partial( + group: h5py.Group, + backed: bool = False, + force_sparse_types: [str, list] = None, + **kwargs +) -> ad.AnnData: + """ + Partially read h5py groups + :params group: file group + :params force_sparse_types: encoding types to convert to sparse_dataset via csr_matrix + :params backed: read sparse matrix as sparse_dataset + :params **kwargs: dict of slot_name: slot, by default use all available slot for the h5py file + :return: AnnData object + """ + if force_sparse_types is None: + force_sparse_types = [] + elif isinstance(force_sparse_types, str): + force_sparse_types = [force_sparse_types] + slots = {} + if backed: + print('Read as backed sparse matrix...') + + for slot_name, slot in kwargs.items(): + print(f'Read slot "{slot}", store as "{slot_name}"...') + if slot not in group: + warnings.warn(f'Slot "{slot}" not found, skip...') + slots[slot_name] = None + else: + elem = group[slot] + iospec = ad._io.specs.get_spec(elem) + if iospec.encoding_type in ("csr_matrix", "csc_matrix") and backed: + slots[slot_name] = sparse_dataset(elem) + elif iospec.encoding_type in force_sparse_types: + slots[slot_name] = csr_matrix(read_elem(elem)) + if backed: + slots[slot_name] = sparse_dataset(slots[slot_name]) + else: + slots[slot_name] = read_elem(elem) + return ad.AnnData(**slots) +
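For reference, a minimal usage sketch of this helper (not part of the diff), mirroring how the method scripts above call it: the file path is the pancreas test resource used in their VIASH START blocks, and the backed example simply exercises the helper's existing backed option for keeping sparse matrices on disk.

# The components make the helper importable by listing read_anndata_partial.py as a
# resource and appending meta["resources_dir"] to sys.path, as shown in the scripts above.
from read_anndata_partial import read_anndata

# Read only the slots a method needs: normalized expression as X, plus obs, var and uns.
adata = read_anndata(
    'resources_test/batch_integration/pancreas/unintegrated.h5ad',
    X='layers/normalized',
    obs='obs',
    var='var',
    uns='uns',
)

# backed=True returns sparse slots as backed sparse datasets instead of loading them into memory.
adata_backed = read_anndata(
    'resources_test/batch_integration/pancreas/unintegrated.h5ad',
    backed=True,
    X='layers/counts',
    obs='obs',
)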