diff --git a/src/control_methods/no_integration_batch/script.py b/src/control_methods/no_integration_batch/script.py
index 34f446b1..0c56292f 100644
--- a/src/control_methods/no_integration_batch/script.py
+++ b/src/control_methods/no_integration_batch/script.py
@@ -24,7 +24,6 @@
 adata = read_anndata(
     par["input_dataset"], X="layers/normalized", obs="obs", var="var", uns="uns"
 )
-adata.var["highly_variable"] = adata.var["hvg"]
 print(adata, flush=True)

 print("\n>>> Calculating PCA by batch...", flush=True)
@@ -47,8 +46,8 @@
     adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca(
         adata[batch_idx].copy(),
         n_comps=n_comps,
-        use_highly_variable=True,
         svd_solver=solver,
+        mask_var=None,
         copy=True,
     ).obsm["X_pca"]

diff --git a/src/data_processors/process_dataset/script.py b/src/data_processors/process_dataset/script.py
index 71e680a3..b87aff26 100644
--- a/src/data_processors/process_dataset/script.py
+++ b/src/data_processors/process_dataset/script.py
@@ -1,7 +1,10 @@
 import sys
+
 import anndata as ad
-import scanpy as sc
+import numpy as np
 import openproblems as op
+import scanpy as sc
+import scib

 ## VIASH START
 par = {
@@ -9,11 +12,11 @@
     "hvgs": 2000,
     "obs_label": "cell_type",
     "obs_batch": "batch",
-    "output": "output.h5ad"
+    "output": "output.h5ad",
 }
 meta = {
     "config": "target/nextflow/data_processors/process_dataset/.config.vsh.yaml",
-    "resources_dir": "target/nextflow/data_processors/process_dataset"
+    "resources_dir": "target/nextflow/data_processors/process_dataset",
 }
 ## VIASH END

@@ -21,83 +24,85 @@
 sys.path.append(meta["resources_dir"])
 from subset_h5ad_by_format import subset_h5ad_by_format

-print(">> Load config", flush=True)
-config = op.project.read_viash_config(meta["config"])
-
-print("Read input", flush=True)
-adata = ad.read_h5ad(par["input"])
+# ----------- FUNCTIONS -----------

 def compute_batched_hvg(adata, n_hvgs):
     if n_hvgs > adata.n_vars or n_hvgs <= 0:
         hvg_list = adata.var_names.tolist()
     else:
-        import scib
         scib_adata = adata.copy()
         del scib_adata.layers["counts"]
         scib_adata.X = scib_adata.layers["normalized"].copy()
         hvg_list = scib.pp.hvg_batch(
-            scib_adata,
-            batch_key="batch",
-            target_genes=n_hvgs,
-            adataOut=False
+            scib_adata, batch_key="batch", target_genes=n_hvgs, adataOut=False
         )
     return hvg_list

-n_hvgs = par["hvgs"]
-n_components = adata.obsm["X_pca"].shape[1]
-if adata.n_vars < n_hvgs:
-    n_hvgs = adata.n_vars
+# ----------- SCRIPT -----------
+print(
+    f"====== Process dataset (scanpy v{sc.__version__}, scib v{scib.__version__}) ======",
+    flush=True,
+)

-if adata.n_vars > n_hvgs:
-    print(f"Select {par['hvgs']} highly variable genes", flush=True)
-    hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs)
+print("\n>>> Loading config...", flush=True)
+config = op.project.read_viash_config(meta["config"])

-    print("Subsetting to HVG dimensions", flush=True)
-    adata = adata[:, adata.var_names.isin(hvg_list)].copy()
+print("\n>>> Reading input files...", flush=True)
+print(f"Input H5AD file: '{par['input']}'", flush=True)
+adata = ad.read_h5ad(par["input"])
+print(adata, flush=True)

-print(">> Recompute HVG", flush=True)
-out = sc.pp.highly_variable_genes(
-    adata,
-    layer="normalized",
-    n_top_genes=n_hvgs,
-    flavor="cell_ranger",
-    inplace=False
-)
-adata.var["hvg"] = out["highly_variable"].values
-adata.var["hvg_score"] = out["dispersions_norm"].values
+n_hvgs = par["hvgs"]
+if adata.n_vars < n_hvgs:
+    print(f"\n>>> Using all {adata.n_vars} features as HVGs...", flush=True)
+    n_hvgs = adata.n_vars
+    hvg_list = adata.var_names.tolist()
+else:
+    print(f"\n>>> Computing {n_hvgs} batch-aware HVGs...", flush=True)
+    hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs)

-print(">> Recompute PCA", flush=True)
+n_components = adata.obsm["X_pca"].shape[1]
+print(f"\n>>> Computing PCA with {n_components} components using HVGs...", flush=True)
+hvg_mask = adata.var_names.isin(hvg_list)
 X_pca, loadings, variance, variance_ratio = sc.pp.pca(
-    adata.layers["normalized"],
+    adata.layers["normalized"],
     n_comps=n_components,
-    return_info=True
+    mask_var=hvg_mask,
+    return_info=True,
 )
 adata.obsm["X_pca"] = X_pca
-adata.varm["pca_loadings"] = loadings.T
-adata.uns["pca_variance"] = {
-    "variance": variance,
-    "variance_ratio": variance_ratio
-}
+adata.varm["pca_loadings"] = np.zeros(shape=(adata.n_vars, n_components))
+adata.varm["pca_loadings"][hvg_mask, :] = loadings.T
+adata.uns["pca_variance"] = {"variance": variance, "variance_ratio": variance_ratio}

-print(">> Recompute neighbors", flush=True)
+print("\n>>> Computing neighbours using PCA...", flush=True)
 del adata.uns["knn"]
 del adata.obsp["knn_connectivities"]
 del adata.obsp["knn_distances"]
 sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=30, key_added="knn")

-print(">> Create output object", flush=True)
-output_dataset = subset_h5ad_by_format(
-    adata,
-    config,
-    "output_dataset"
+print("\n>>> Recomputing standard HVG scores...", flush=True)
+out = sc.pp.highly_variable_genes(
+    adata, layer="normalized", n_top_genes=n_hvgs, flavor="cell_ranger", inplace=False
 )
-output_solution = subset_h5ad_by_format(
-    adata,
-    config,
-    "output_solution"
+adata.var["hvg"] = out["highly_variable"].values
+adata.var["hvg_score"] = out["dispersions_norm"].values
+
+print("\n>>> Creating dataset output object...", flush=True)
+output_dataset = subset_h5ad_by_format(
+    adata[:, adata.var_names.isin(hvg_list)].copy(), config, "output_dataset"
 )
+print(output_dataset, flush=True)

-print("Writing adatas to file", flush=True)
+print("\n>>> Creating solution output object...", flush=True)
+output_solution = subset_h5ad_by_format(adata, config, "output_solution")
+print(output_solution, flush=True)
+
+print("\n>>> Writing output to files...", flush=True)
+print(f"Dataset H5AD file: '{par['output_dataset']}'", flush=True)
 output_dataset.write_h5ad(par["output_dataset"], compression="gzip")
+print(f"Solution H5AD file: '{par['output_solution']}'", flush=True)
 output_solution.write_h5ad(par["output_solution"], compression="gzip")
+
+print("\n>>> Done!", flush=True)
diff --git a/src/metrics/hvg_overlap/script.py b/src/metrics/hvg_overlap/script.py
index 2db0f9b8..b8e2122b 100644
--- a/src/metrics/hvg_overlap/script.py
+++ b/src/metrics/hvg_overlap/script.py
@@ -51,7 +51,11 @@
 adata_integrated = adata_integrated[~adata_integrated.obs["batch"].isin(skip_batches)]

 print("Compute score", flush=True)
-score = hvg_overlap(adata_solution, adata_integrated, batch_key="batch")
+score = hvg_overlap(
+    adata_solution[:, adata_solution.var_names.isin(adata_integrated.var_names)],
+    adata_integrated,
+    batch_key="batch"
+)

 print("Create output AnnData object", flush=True)
 output = ad.AnnData(
diff --git a/src/metrics/pcr/script.py b/src/metrics/pcr/script.py
index 0ae18ddb..95656d23 100644
--- a/src/metrics/pcr/script.py
+++ b/src/metrics/pcr/script.py
@@ -29,6 +29,7 @@
 )
 adata_integrated = read_anndata(
     par['input_integrated'],
+    var='var',
     obs='obs',
     obsm='obsm',
     uns='uns'
@@ -39,11 +40,11 @@

 print('compute score', flush=True)
 score = pcr_comparison(
-    adata_solution,
+    adata_solution[:, adata_solution.var_names.isin(adata_integrated.var_names)],
     adata_integrated,
     embed='X_emb',
     covariate='batch',
-    verbose=False
+    verbose=True
 )

 print('Create output AnnData object', flush=True)
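Note on the scanpy API change these diffs rely on: recent scanpy releases (1.10 and later) replace the deprecated use_highly_variable flag of sc.pp.pca / sc.tl.pca with a mask_var argument, and when PCA is run on a masked matrix with return_info=True the returned loadings only cover the masked genes, which is why process_dataset/script.py scatters them back into a full-size array. The sketch below is a minimal illustration of that pattern, assuming scanpy >= 1.10; the toy data and variable names are illustrative and not part of the diff.

import anndata as ad
import numpy as np
import scanpy as sc

# Toy data; in the pipeline this would be the normalized expression layer.
rng = np.random.default_rng(0)
adata = ad.AnnData(X=rng.poisson(1.0, size=(100, 50)).astype(np.float32))
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)

# Pretend the first 20 genes are the batch-aware HVGs.
hvg_mask = np.zeros(adata.n_vars, dtype=bool)
hvg_mask[:20] = True

n_components = 10

# Old: sc.pp.pca(..., use_highly_variable=True)
# New: pass a boolean mask (or a .var column name) via mask_var.
X_pca, loadings, *variance_info = sc.pp.pca(
    adata.X,
    n_comps=n_components,
    mask_var=hvg_mask,
    return_info=True,
)

# The returned loadings only cover the masked genes, so scatter them back
# into a full-size (n_vars x n_components) array before storing in .varm.
pca_loadings = np.zeros((adata.n_vars, n_components))
pca_loadings[hvg_mask, :] = loadings.T
adata.obsm["X_pca"] = X_pca
adata.varm["pca_loadings"] = pca_loadings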