migrate methods

openproblems-bio · Sep 21, 2024 · 1262c52 · 1262c52
1 parent afb758c
commit 1262c52
Show file tree

Hide file tree

Showing 25 changed files with 1,323 additions and 125 deletions.
diff --git a/src/methods/bbknn/config.vsh.yaml b/src/methods/bbknn/config.vsh.yaml
@@ -0,0 +1,52 @@
+__merge__: /src/api/comp_method.yaml
+name: bbknn
+label: BBKNN
+summary: BBKNN creates a k-nearest-neighbours graph by identifying neighbours within
+  batches, then combining and processing them with UMAP for visualization.
+description: |
+  "BBKNN or batch balanced k nearest neighbours graph is built for each cell by
+  identifying its k nearest neighbours within each defined batch separately,
+  creating independent neighbour sets for each cell in each batch. These sets
+  are then combined and processed with the UMAP algorithm for visualisation."
+references:
+  doi: 10.1093/bioinformatics/btz625
+links:
+  repository: https://github.com/Teichlab/bbknn
+  documentation: https://github.com/Teichlab/bbknn#readme
+info:
+  method_types: [graph]
+  preferred_normalization: log_cp10k
+  variants:
+    bbknn_full_unscaled:
+    bbknn_full_scaled:
+      preferred_normalization: log_cp10k_scaled
+arguments:
+  - name: --annoy_n_trees
+    type: integer
+    default: 10
+    description: Number of trees to use in the annoy forest.
+  - name: --neighbors_within_batch
+    type: integer
+    default: 3
+    description: Number of neighbors to report within each batch.
+  - name: --n_hvg
+    type: integer
+    default: 2000
+    description: Number of highly variable genes to use.
+resources:
+  - type: python_script
+    path: script.py
+  - type: python_script
+    path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+    setup:
+      - type: python
+        pypi:
+          - bbknn
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, midmem, lowcpu]
diff --git a/src/methods/bbknn/script.py b/src/methods/bbknn/script.py
@@ -0,0 +1,63 @@
+import sys
+import anndata as ad
+import scanpy as sc
+import bbknn
+
+## VIASH START
+par = {
+    'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad',
+    'output': 'output.h5ad',
+    'annoy_n_trees': 10,
+    'neighbors_within_batch': 3,
+    'n_hvg': 2000,
+}
+meta = {
+    'name': 'foo',
+    'config': 'bar'
+}
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+
+print('Read input', flush=True)
+adata = read_anndata(
+    par['input'],
+    X='layers/normalized',
+    obs='obs',
+    var='var',
+    uns='uns'
+)
+
+if par['n_hvg']:
+    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
+    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
+    adata = adata[:, idx].copy()
+    sc.pp.pca(adata)
+
+print('Run BBKNN', flush=True)
+kwargs = dict(batch_key='batch', copy=True)
+kwargs['annoy_n_trees'] = par['annoy_n_trees']
+kwargs['neighbors_within_batch'] = par['neighbors_within_batch']
+
+ad_bbknn = bbknn.bbknn(adata, **kwargs)
+
+print("Store output", flush=True)
+output = ad.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    obsp={
+        'connectivities': ad_bbknn.obsp['connectivities'],
+        'distances': ad_bbknn.obsp['distances'],
+    },
+    uns={
+        'dataset_id': adata.uns['dataset_id'],
+        'normalization_id': adata.uns['normalization_id'],
+        'method_id': meta['name'],
+        'neighbors': ad_bbknn.uns['neighbors']
+    }
+)
+
+print("Store outputs", flush=True)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/methods/combat/config.vsh.yaml b/src/methods/combat/config.vsh.yaml
@@ -0,0 +1,42 @@
+__merge__: /src/api/comp_method.yaml
+name: combat
+label: Combat
+summary: Adjusting batch effects in microarray expression data using empirical Bayes
+  methods
+description: |
+  "An Empirical Bayes (EB) approach to correct for batch effects. It
+  estimates batch-specific parameters by pooling information across genes in
+  each batch and shrinks the estimates towards the overall mean of the batch
+  effect estimates across all genes. These parameters are then used to adjust
+  the data for batch effects, leading to more accurate and reproducible
+  results."
+references:
+  doi: 10.1093/biostatistics/kxj037
+links:
+  repository: https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html
+  documentation: https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html
+info:
+  method_types: [feature]
+  preferred_normalization: log_cp10k
+  variants:
+    combat_full_unscaled:
+    combat_full_scaled:
+      preferred_normalization: log_cp10k_scaled
+arguments:
+  - name: --n_hvg
+    type: integer
+    default: 2000
+    description: Number of highly variable genes to use.
+resources:
+  - type: python_script
+    path: script.py
+  - type: python_script
+    path: /src/utils/read_anndata_partial.py
+engines:
+  - type: docker
+    image: openproblems/base_python:1.0.0
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, highmem, lowcpu]
diff --git a/src/methods/combat/script.py b/src/methods/combat/script.py
@@ -0,0 +1,56 @@
+import sys
+import scanpy as sc
+from scipy.sparse import csr_matrix
+
+## VIASH START
+par = {
+    'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad',
+    'output': 'output.h5ad',
+    'n_hvg': 2000,
+}
+
+meta = {
+    'name': 'foo',
+    'config': 'bar'
+}
+
+## VIASH END
+
+sys.path.append(meta["resources_dir"])
+from read_anndata_partial import read_anndata
+
+print('Read input', flush=True)
+adata = read_anndata(
+    par['input'],
+    X='layers/normalized',
+    obs='obs',
+    var='var',
+    uns='uns'
+)
+
+if par['n_hvg']:
+    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
+    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
+    adata = adata[:, idx].copy()
+
+
+print('Run Combat', flush=True)
+adata.X = sc.pp.combat(adata, key='batch', inplace=False)
+
+
+print("Store output", flush=True)
+output = sc.AnnData(
+    obs=adata.obs[[]],
+    var=adata.var[[]],
+    uns={
+        'dataset_id': adata.uns['dataset_id'],
+        'normalization_id': adata.uns['normalization_id'],
+        'method_id': meta['name'],
+    },
+    layers={
+        'corrected_counts': csr_matrix(adata.X),
+    }
+)
+
+print("Store outputs", flush=True)
+output.write_h5ad(par['output'], compression='gzip')
diff --git a/src/methods/fastmnn/config.vsh.yaml b/src/methods/fastmnn/config.vsh.yaml
@@ -0,0 +1,34 @@
+__merge__: /src/api/comp_method.yaml
+name: fastmnn
+label: fastMNN
+summary: A simpler version of the original mnnCorrect algorithm.
+description: |
+  The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps.
+
+  1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality.
+  2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch.
+  3. Remove variation along the average batch vector in both reference and target batches.
+  4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors.
+  5. Merge the corrected target batch with the reference, and repeat with the next target batch.
+references:
+  doi: 10.1038/nbt.4091
+links:
+  repository: https://code.bioconductor.org/browse/batchelor/
+  documentation: https://bioconductor.org/packages/batchelor/
+info:
+  method_types: [feature, embedding]
+  preferred_normalization: log_cp10k
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    setup:
+      - type: r
+        bioc: batchelor
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [midtime, lowcpu, highmem]
diff --git a/src/methods/fastmnn/script.R b/src/methods/fastmnn/script.R
@@ -0,0 +1,50 @@
+cat("Loading dependencies\n")
+# anndata and batchelor are only called through `::`, so their namespaces are
+# loaded without being attached; Matrix and SingleCellExperiment are attached.
+suppressPackageStartupMessages({
+  requireNamespace("anndata", quietly = TRUE)
+  library(Matrix, warn.conflicts = FALSE)
+  requireNamespace("batchelor", quietly = TRUE)
+  library(SingleCellExperiment, warn.conflicts = FALSE)
+})
+
+## VIASH START
+par <- list(
+  input = "resources_test/batch_integration/pancreas/unintegrated.h5ad",
+  output = "output.h5ad"
+)
+meta <- list(
+  name = "mnn_correct_feature"
+)
+## VIASH END
+
+cat("Read input\n")
+adata <- anndata::read_h5ad(par$input)
+
+# TODO: pass output of 'multiBatchNorm' to fastMNN
+
+cat("Run mnn\n")
+# fastMNN receives the transposed normalized layer together with the batch
+# labels from obs; warnings raised during the call are suppressed.
+mnn_result <- suppressWarnings(batchelor::fastMNN(
+  t(adata$layers[["normalized"]]),
+  batch = adata$obs[["batch"]]
+))
+
+# Pull the "reconstructed" assay (coerced to a sparse matrix) and the
+# "corrected" reduced dimension out of the fastMNN result.
+reconstructed <- as(
+  SummarizedExperiment::assay(mnn_result, "reconstructed"),
+  "sparseMatrix"
+)
+embedding <- SingleCellExperiment::reducedDim(mnn_result, "corrected")
+
+cat("Reformat output\n")
+# The reconstructed matrix is transposed back before being stored as a layer.
+output <- anndata::AnnData(
+  layers = list(corrected_counts = t(reconstructed)),
+  obsm = list(X_emb = embedding),
+  shape = adata$shape,
+  uns = list(
+    dataset_id = adata$uns[["dataset_id"]],
+    normalization_id = adata$uns[["normalization_id"]],
+    method_id = meta$name
+  )
+)
+
+cat("Write output to file\n")
+# invisible() keeps the return value of write_h5ad from being printed.
+invisible(output$write_h5ad(par$output, compression = "gzip"))
diff --git a/src/methods/liger/config.vsh.yaml b/src/methods/liger/config.vsh.yaml
@@ -0,0 +1,34 @@
+__merge__: /src/api/comp_method.yaml
+name: liger
+label: LIGER
+summary: Linked Inference of Genomic Experimental Relationships
+description: |
+  LIGER (linked inference of genomic experimental relationships) uses integrative
+  non-negative matrix factorization (iNMF), deriving and implementing a novel
+  coordinate descent algorithm to perform the factorization efficiently. Joint
+  clustering is then performed and the factor loadings are normalised.
+references:
+  doi: 10.1016/j.cell.2019.05.006
+links:
+  repository: https://github.com/welch-lab/liger
+  documentation: https://github.com/welch-lab/liger
+info:
+  method_types: [embedding]
+  preferred_normalization: log_cp10k
+resources:
+  - type: r_script
+    path: script.R
+engines:
+  - type: docker
+    image: openproblems/base_r:1.0.0
+    setup:
+      - type: apt
+        packages: cmake
+      - type: r
+        cran: rliger
+        github: welch-lab/RcppPlanc
+runners:
+  - type: executable
+  - type: nextflow
+    directives:
+      label: [lowcpu, highmem, midtime]