generated from openproblems-bio/task_template
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Include all features in solution (#28)
- Loading branch information
Showing
4 changed files
with
65 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,103 +1,108 @@ | ||
import sys | ||
|
||
import anndata as ad | ||
import scanpy as sc | ||
import numpy as np | ||
import openproblems as op | ||
import scanpy as sc | ||
import scib | ||
|
||
## VIASH START | ||
par = { | ||
"input": "resources_test/common/pancreas/dataset.h5ad", | ||
"hvgs": 2000, | ||
"obs_label": "cell_type", | ||
"obs_batch": "batch", | ||
"output": "output.h5ad" | ||
"output": "output.h5ad", | ||
} | ||
meta = { | ||
"config": "target/nextflow/data_processors/process_dataset/.config.vsh.yaml", | ||
"resources_dir": "target/nextflow/data_processors/process_dataset" | ||
"resources_dir": "target/nextflow/data_processors/process_dataset", | ||
} | ||
## VIASH END | ||
|
||
# import helper functions | ||
sys.path.append(meta["resources_dir"]) | ||
from subset_h5ad_by_format import subset_h5ad_by_format | ||
|
||
print(">> Load config", flush=True) | ||
config = op.project.read_viash_config(meta["config"]) | ||
|
||
print("Read input", flush=True) | ||
adata = ad.read_h5ad(par["input"]) | ||
|
||
# ----------- FUNCTIONS ----------- | ||
def compute_batched_hvg(adata, n_hvgs): | ||
if n_hvgs > adata.n_vars or n_hvgs <= 0: | ||
hvg_list = adata.var_names.tolist() | ||
else: | ||
import scib | ||
scib_adata = adata.copy() | ||
del scib_adata.layers["counts"] | ||
scib_adata.X = scib_adata.layers["normalized"].copy() | ||
hvg_list = scib.pp.hvg_batch( | ||
scib_adata, | ||
batch_key="batch", | ||
target_genes=n_hvgs, | ||
adataOut=False | ||
scib_adata, batch_key="batch", target_genes=n_hvgs, adataOut=False | ||
) | ||
return hvg_list | ||
|
||
n_hvgs = par["hvgs"] | ||
n_components = adata.obsm["X_pca"].shape[1] | ||
|
||
if adata.n_vars < n_hvgs: | ||
n_hvgs = adata.n_vars | ||
# ----------- SCRIPT ----------- | ||
print( | ||
f"====== Process dataset (scanpy v{sc.__version__}, scib v{scib.__version__}) ======", | ||
flush=True, | ||
) | ||
|
||
if adata.n_vars > n_hvgs: | ||
print(f"Select {par['hvgs']} highly variable genes", flush=True) | ||
hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs) | ||
print("\n>>> Loading config...", flush=True) | ||
config = op.project.read_viash_config(meta["config"]) | ||
|
||
print("Subsetting to HVG dimensions", flush=True) | ||
adata = adata[:, adata.var_names.isin(hvg_list)].copy() | ||
print("\n>>> Reading input files...", flush=True) | ||
print(f"Input H5AD file: '{par['input']}'", flush=True) | ||
adata = ad.read_h5ad(par["input"]) | ||
print(adata, flush=True) | ||
|
||
print(">> Recompute HVG", flush=True) | ||
out = sc.pp.highly_variable_genes( | ||
adata, | ||
layer="normalized", | ||
n_top_genes=n_hvgs, | ||
flavor="cell_ranger", | ||
inplace=False | ||
) | ||
adata.var["hvg"] = out["highly_variable"].values | ||
adata.var["hvg_score"] = out["dispersions_norm"].values | ||
n_hvgs = par["hvgs"] | ||
if adata.n_vars < n_hvgs: | ||
print(f"\n>>> Using all {adata.n_vars} features as HVGs...", flush=True) | ||
n_hvgs = adata.n_vars | ||
hvg_list = adata.var_names.tolist() | ||
else: | ||
print(f"\n>>> Computing {n_hvgs} batch-aware HVGs...", flush=True) | ||
hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs) | ||
|
||
print(">> Recompute PCA", flush=True) | ||
n_components = adata.obsm["X_pca"].shape[1] | ||
print(f"\n>>> Computing PCA with {n_components} components using HVGs...", flush=True) | ||
hvg_mask = adata.var_names.isin(hvg_list) | ||
X_pca, loadings, variance, variance_ratio = sc.pp.pca( | ||
adata.layers["normalized"], | ||
adata.layers["normalized"], | ||
n_comps=n_components, | ||
return_info=True | ||
mask_var=hvg_mask, | ||
return_info=True, | ||
) | ||
adata.obsm["X_pca"] = X_pca | ||
adata.varm["pca_loadings"] = loadings.T | ||
adata.uns["pca_variance"] = { | ||
"variance": variance, | ||
"variance_ratio": variance_ratio | ||
} | ||
adata.varm["pca_loadings"] = np.zeros(shape=(adata.n_vars, n_components)) | ||
adata.varm["pca_loadings"][hvg_mask, :] = loadings.T | ||
adata.uns["pca_variance"] = {"variance": variance, "variance_ratio": variance_ratio} | ||
|
||
print(">> Recompute neighbors", flush=True) | ||
print("\n>>> Computing neighbours using PCA...", flush=True) | ||
del adata.uns["knn"] | ||
del adata.obsp["knn_connectivities"] | ||
del adata.obsp["knn_distances"] | ||
sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=30, key_added="knn") | ||
|
||
print(">> Create output object", flush=True) | ||
output_dataset = subset_h5ad_by_format( | ||
adata, | ||
config, | ||
"output_dataset" | ||
print("\n>>> Recomputing standard HVG scores...", flush=True) | ||
out = sc.pp.highly_variable_genes( | ||
adata, layer="normalized", n_top_genes=n_hvgs, flavor="cell_ranger", inplace=False | ||
) | ||
output_solution = subset_h5ad_by_format( | ||
adata, | ||
config, | ||
"output_solution" | ||
adata.var["hvg"] = out["highly_variable"].values | ||
adata.var["hvg_score"] = out["dispersions_norm"].values | ||
|
||
print("\n>>> Creating dataset output object...", flush=True) | ||
output_dataset = subset_h5ad_by_format( | ||
adata[:, adata.var_names.isin(hvg_list)].copy(), config, "output_dataset" | ||
) | ||
print(output_dataset, flush=True) | ||
|
||
print("Writing adatas to file", flush=True) | ||
print("\n>>> Creating solution output object...", flush=True) | ||
output_solution = subset_h5ad_by_format(adata, config, "output_solution") | ||
print(output_solution, flush=True) | ||
|
||
print("\n>>> Writing output to files...", flush=True) | ||
print(f"Dataset H5AD file: '{par['output_dataset']}'", flush=True) | ||
output_dataset.write_h5ad(par["output_dataset"], compression="gzip") | ||
print(f"Solution H5AD file: '{par['output_solution']}'", flush=True) | ||
output_solution.write_h5ad(par["output_solution"], compression="gzip") | ||
|
||
print("\n>>> Done!", flush=True) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters