Skip to content

Commit

Permalink
migrate methods
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Sep 21, 2024
1 parent afb758c commit 1262c52
Show file tree
Hide file tree
Showing 25 changed files with 1,323 additions and 125 deletions.
52 changes: 52 additions & 0 deletions src/methods/bbknn/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Viash component config for the BBKNN batch-integration method.
# Fields not listed here come from the shared method API file merged below.
__merge__: /src/api/comp_method.yaml
name: bbknn
label: BBKNN
summary: BBKNN creates k nearest neighbours graph by identifying neighbours within
  batches, then combining and processing them with UMAP for visualization.
description: |
  "BBKNN or batch balanced k nearest neighbours graph is built for each cell by
  identifying its k nearest neighbours within each defined batch separately,
  creating independent neighbour sets for each cell in each batch. These sets
  are then combined and processed with the UMAP algorithm for visualisation."
references:
  doi: 10.1093/bioinformatics/btz625
links:
  repository: https://github.com/Teichlab/bbknn
  documentation: https://github.com/Teichlab/bbknn#readme
info:
  # script.py writes obsp connectivities/distances, i.e. a graph output.
  method_types: [graph]
  preferred_normalization: log_cp10k
  variants:
    # Uses the defaults above.
    bbknn_full_unscaled:
    # Same method, run on the scaled normalization.
    bbknn_full_scaled:
      preferred_normalization: log_cp10k_scaled
arguments:
  - name: --annoy_n_trees
    type: integer
    default: 10
    description: Number of trees to use in the annoy forest.
  - name: --neighbors_within_batch
    type: integer
    default: 3
    description: Number of neighbors to report within each batch.
  - name: --n_hvg
    type: integer
    default: 2000
    description: Number of highly variable genes to use.
resources:
  - type: python_script
    path: script.py
  # Helper imported by script.py via meta["resources_dir"].
  - type: python_script
    path: /src/utils/read_anndata_partial.py
engines:
  - type: docker
    image: openproblems/base_python:1.0.0
    setup:
      - type: python
        pypi:
          - bbknn
runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, midmem, lowcpu]
63 changes: 63 additions & 0 deletions src/methods/bbknn/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import sys
import anndata as ad
import scanpy as sc
import bbknn

## VIASH START
# Development-time defaults for running this script directly from the
# repository root. Viash is expected to substitute this section with the real
# `par` / `meta` values when the component is built.
par = {
    'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad',
    'output': 'output.h5ad',
    'annoy_n_trees': 10,
    'neighbors_within_batch': 3,
    'n_hvg': 2000,
}
meta = {
    'name': 'foo',
    'config': 'bar',
    # The script appends meta["resources_dir"] to sys.path to import
    # read_anndata_partial; without this key a direct run fails with KeyError.
    # The helper lives in src/utils (see the component config's resources).
    'resources_dir': 'src/utils',
}
## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
# Request only the slots used below. The 'layers/normalized' entry is passed
# as X, so downstream code works on adata.X (presumably the normalized layer;
# confirm against read_anndata_partial).
input_slots = {
    'X': 'layers/normalized',
    'obs': 'obs',
    'var': 'var',
    'uns': 'uns',
}
adata = read_anndata(par['input'], **input_slots)

# Optionally restrict the data to the top-scoring highly variable genes.
if par['n_hvg']:
    print(f"Select top {par['n_hvg']} high variable genes", flush=True)
    # Order genes by descending 'hvg_score' and keep the first n_hvg of them.
    idx = adata.var['hvg_score'].to_numpy().argsort()[::-1][:par['n_hvg']]
    adata = adata[:, idx].copy()

# PCA must run whether or not HVG selection happened: the input is loaded
# without obsm, and bbknn builds its graph from a PCA representation
# (bbknn's default use_rep is 'X_pca').
sc.pp.pca(adata)

print('Run BBKNN', flush=True)
# copy=True makes bbknn return a new object; `adata` itself is left untouched
# and is still used for obs/var/uns when the output is assembled.
ad_bbknn = bbknn.bbknn(
    adata,
    batch_key='batch',
    copy=True,
    annoy_n_trees=par['annoy_n_trees'],
    neighbors_within_batch=par['neighbors_within_batch'],
)

print("Store output", flush=True)
# Selecting an empty column list keeps the obs/var indices but no columns.
obs_index_only = adata.obs[[]]
var_index_only = adata.var[[]]

# Neighbour graph produced by bbknn.
graph_obsp = {
    key: ad_bbknn.obsp[key]
    for key in ('connectivities', 'distances')
}

# Dataset provenance is copied from the input; the method id comes from Viash.
output_uns = dict(
    dataset_id=adata.uns['dataset_id'],
    normalization_id=adata.uns['normalization_id'],
    method_id=meta['name'],
    neighbors=ad_bbknn.uns['neighbors'],
)

output = ad.AnnData(
    obs=obs_index_only,
    var=var_index_only,
    obsp=graph_obsp,
    uns=output_uns,
)

print("Store outputs", flush=True)
output.write_h5ad(par['output'], compression='gzip')
42 changes: 42 additions & 0 deletions src/methods/combat/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Viash component config for the ComBat batch-correction method.
# Fields not listed here come from the shared method API file merged below.
__merge__: /src/api/comp_method.yaml
name: combat
label: Combat
summary: Adjusting batch effects in microarray expression data using empirical Bayes
  methods
description: |
  "An Empirical Bayes (EB) approach to correct for batch effects. It
  estimates batch-specific parameters by pooling information across genes in
  each batch and shrinks the estimates towards the overall mean of the batch
  effect estimates across all genes. These parameters are then used to adjust
  the data for batch effects, leading to more accurate and reproducible
  results."
references:
  doi: 10.1093/biostatistics/kxj037
links:
  # Both links point at the scanpy implementation that script.py calls.
  repository: https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html
  documentation: https://scanpy.readthedocs.io/en/stable/api/scanpy.pp.combat.html
info:
  # script.py writes a 'corrected_counts' layer, i.e. a feature-level output.
  method_types:
    - feature
  preferred_normalization: log_cp10k
  variants:
    # Uses the defaults above.
    combat_full_unscaled:
    # Same method, run on the scaled normalization.
    combat_full_scaled:
      preferred_normalization: log_cp10k_scaled
arguments:
  - name: --n_hvg
    type: integer
    default: 2000
    description: Number of highly variable genes to use.
resources:
  - type: python_script
    path: script.py
  # Helper imported by script.py via meta["resources_dir"].
  - type: python_script
    path: /src/utils/read_anndata_partial.py
engines:
  - type: docker
    image: openproblems/base_python:1.0.0
runners:
  - type: executable
  - type: nextflow
    directives:
      label:
        - midtime
        - highmem
        - lowcpu
56 changes: 56 additions & 0 deletions src/methods/combat/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import sys
import scanpy as sc
from scipy.sparse import csr_matrix

## VIASH START
# Development-time defaults for running this script directly from the
# repository root. Viash is expected to substitute this section with the real
# `par` / `meta` values when the component is built.
par = {
    'input': 'resources_test/batch_integration/pancreas/unintegrated.h5ad',
    'output': 'output.h5ad',
    'n_hvg': 2000,
}

meta = {
    'name': 'foo',
    'config': 'bar',
    # The script appends meta["resources_dir"] to sys.path to import
    # read_anndata_partial; without this key a direct run fails with KeyError.
    # The helper lives in src/utils (see the component config's resources).
    'resources_dir': 'src/utils',
}

## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata

print('Read input', flush=True)
# Request only the slots used below. The 'layers/normalized' entry is passed
# as X, so downstream code works on adata.X (presumably the normalized layer;
# confirm against read_anndata_partial).
input_slots = {
    'X': 'layers/normalized',
    'obs': 'obs',
    'var': 'var',
    'uns': 'uns',
}
adata = read_anndata(par['input'], **input_slots)

# Optionally restrict the data to the top-scoring highly variable genes.
# A falsy n_hvg (None or 0) skips the selection entirely.
n_hvg = par['n_hvg']
if n_hvg:
    print(f"Select top {n_hvg} high variable genes", flush=True)
    # Gene positions ordered by descending 'hvg_score'.
    ranked_genes = adata.var['hvg_score'].to_numpy().argsort()[::-1]
    idx = ranked_genes[:n_hvg]
    adata = adata[:, idx].copy()


print('Run Combat', flush=True)
# inplace=False makes combat return the corrected matrix, which then
# replaces adata.X.
corrected = sc.pp.combat(adata, key='batch', inplace=False)
adata.X = corrected


print("Store output", flush=True)
# Dataset provenance is copied from the input; the method id comes from Viash.
output_uns = dict(
    dataset_id=adata.uns['dataset_id'],
    normalization_id=adata.uns['normalization_id'],
    method_id=meta['name'],
)
# The corrected matrix is stored in sparse (CSR) form.
output_layers = {'corrected_counts': csr_matrix(adata.X)}

# Selecting an empty column list keeps the obs/var indices but no columns.
output = sc.AnnData(
    obs=adata.obs[[]],
    var=adata.var[[]],
    uns=output_uns,
    layers=output_layers,
)

print("Store outputs", flush=True)
output.write_h5ad(par['output'], compression='gzip')
34 changes: 34 additions & 0 deletions src/methods/fastmnn/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Viash component config for the fastMNN batch-integration method.
# Fields not listed here come from the shared method API file merged below.
__merge__: /src/api/comp_method.yaml
name: fastmnn
label: fastMNN
summary: A simpler version of the original mnnCorrect algorithm.
description: |
  The fastMNN() approach is much simpler than the original mnnCorrect() algorithm, and proceeds in several steps.
  1. Perform a multi-sample PCA on the (cosine-)normalized expression values to reduce dimensionality.
  2. Identify MNN pairs in the low-dimensional space between a reference batch and a target batch.
  3. Remove variation along the average batch vector in both reference and target batches.
  4. Correct the cells in the target batch towards the reference, using locally weighted correction vectors.
  5. Merge the corrected target batch with the reference, and repeat with the next target batch.
references:
  doi: 10.1038/nbt.4091
links:
  repository: https://code.bioconductor.org/browse/batchelor/
  documentation: https://bioconductor.org/packages/batchelor/
info:
  # script.R writes both a 'corrected_counts' layer and an 'X_emb' embedding.
  method_types: [feature, embedding]
  preferred_normalization: log_cp10k
resources:
  - type: r_script
    path: script.R
engines:
  - type: docker
    image: openproblems/base_r:1.0.0
    setup:
      - type: r
        bioc: batchelor
runners:
  - type: executable
  - type: nextflow
    directives:
      label: [midtime, lowcpu, highmem]
50 changes: 50 additions & 0 deletions src/methods/fastmnn/script.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
cat("Loading dependencies\n")
suppressPackageStartupMessages({
requireNamespace("anndata", quietly = TRUE)
library(Matrix, warn.conflicts = FALSE)
requireNamespace("batchelor", quietly = TRUE)
library(SingleCellExperiment, warn.conflicts = FALSE)
})

## VIASH START
# Development-time defaults for running this script directly from the
# repository root. Viash is expected to substitute this section with the real
# `par` / `meta` values when the component is built.
par <- list(
  input = "resources_test/batch_integration/pancreas/unintegrated.h5ad",
  output = "output.h5ad"
)
meta <- list(
  # Written to uns$method_id below; matches this component's name so that a
  # direct run does not label its output as a different method.
  name = "fastmnn"
)
## VIASH END

cat("Read input\n")
adata <- anndata::read_h5ad(par$input)

# TODO: pass output of 'multiBatchNorm' to fastMNN

cat("Run mnn\n")
# The normalized layer is transposed before being handed to fastMNN
# (presumably because AnnData stores cells in rows and fastMNN expects
# cells in columns -- confirm against the batchelor documentation).
normalized_t <- t(adata$layers[["normalized"]])
batch_labels <- adata$obs[["batch"]]
out <- suppressWarnings(
  batchelor::fastMNN(normalized_t, batch = batch_labels)
)

# Corrected expression values, converted to a sparse matrix class.
layer <- as(SummarizedExperiment::assay(out, "reconstructed"), "sparseMatrix")
# Batch-corrected low-dimensional coordinates.
obsm <- SingleCellExperiment::reducedDim(out, "corrected")

cat("Reformat output\n")
# Dataset provenance is copied from the input; the method id comes from Viash.
output_uns <- list(
  dataset_id = adata$uns[["dataset_id"]],
  normalization_id = adata$uns[["normalization_id"]],
  method_id = meta$name
)

# NOTE(review): only `shape` is passed on, not obs/var, whereas the Python
# methods in this commit pass obs=adata.obs[[]] / var=adata.var[[]] --
# confirm whether obs/var names need to be carried over here as well.
output <- anndata::AnnData(
  layers = list(
    # Transposed back so that it matches the input's shape.
    corrected_counts = t(layer)
  ),
  obsm = list(
    X_emb = obsm
  ),
  shape = adata$shape,
  uns = output_uns
)

cat("Write output to file\n")
# invisible() keeps the return value from being printed at top level.
invisible(output$write_h5ad(par$output, compression = "gzip"))
34 changes: 34 additions & 0 deletions src/methods/liger/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Viash component config for the LIGER batch-integration method.
# Fields not listed here come from the shared method API file merged below.
__merge__: /src/api/comp_method.yaml
name: liger
label: LIGER
summary: Linked Inference of Genomic Experimental Relationships
description: |
  LIGER or linked inference of genomic experimental relationships uses iNMF
  deriving and implementing a novel coordinate descent algorithm to efficiently
  do the factorization. Joint clustering is performed and factor loadings are
  normalised.
references:
  doi: 10.1016/j.cell.2019.05.006
links:
  repository: https://github.com/welch-lab/liger
  documentation: https://github.com/welch-lab/liger
info:
  method_types:
    - embedding
  preferred_normalization: log_cp10k
resources:
  - type: r_script
    path: script.R
engines:
  - type: docker
    image: openproblems/base_r:1.0.0
    setup:
      # System package installed ahead of the R packages below.
      - type: apt
        packages: cmake
      - type: r
        cran: rliger
        github: welch-lab/RcppPlanc
runners:
  - type: executable
  - type: nextflow
    directives:
      label:
        - lowcpu
        - highmem
        - midtime
Loading

0 comments on commit 1262c52

Please sign in to comment.