Skip to content

Commit

Permalink
Include all features in solution (#28)
Browse files Browse the repository at this point in the history
  • Loading branch information
lazappi authored Jan 15, 2025
1 parent 7e3c679 commit 83da5db
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 56 deletions.
3 changes: 1 addition & 2 deletions src/control_methods/no_integration_batch/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
adata = read_anndata(
par["input_dataset"], X="layers/normalized", obs="obs", var="var", uns="uns"
)
adata.var["highly_variable"] = adata.var["hvg"]
print(adata, flush=True)

print("\n>>> Calculating PCA by batch...", flush=True)
Expand All @@ -47,8 +46,8 @@
adata.obsm["X_emb"][batch_idx, :n_comps] = sc.tl.pca(
adata[batch_idx].copy(),
n_comps=n_comps,
use_highly_variable=True,
svd_solver=solver,
mask_var=None,
copy=True,
).obsm["X_pca"]

Expand Down
107 changes: 56 additions & 51 deletions src/data_processors/process_dataset/script.py
Original file line number Diff line number Diff line change
@@ -1,103 +1,108 @@
import sys

import anndata as ad
import scanpy as sc
import numpy as np
import openproblems as op
import scanpy as sc
import scib

## VIASH START
par = {
"input": "resources_test/common/pancreas/dataset.h5ad",
"hvgs": 2000,
"obs_label": "cell_type",
"obs_batch": "batch",
"output": "output.h5ad"
"output": "output.h5ad",
}
meta = {
"config": "target/nextflow/data_processors/process_dataset/.config.vsh.yaml",
"resources_dir": "target/nextflow/data_processors/process_dataset"
"resources_dir": "target/nextflow/data_processors/process_dataset",
}
## VIASH END

# import helper functions
sys.path.append(meta["resources_dir"])
from subset_h5ad_by_format import subset_h5ad_by_format

print(">> Load config", flush=True)
config = op.project.read_viash_config(meta["config"])

print("Read input", flush=True)
adata = ad.read_h5ad(par["input"])

# ----------- FUNCTIONS -----------
def compute_batched_hvg(adata, n_hvgs):
if n_hvgs > adata.n_vars or n_hvgs <= 0:
hvg_list = adata.var_names.tolist()
else:
import scib
scib_adata = adata.copy()
del scib_adata.layers["counts"]
scib_adata.X = scib_adata.layers["normalized"].copy()
hvg_list = scib.pp.hvg_batch(
scib_adata,
batch_key="batch",
target_genes=n_hvgs,
adataOut=False
scib_adata, batch_key="batch", target_genes=n_hvgs, adataOut=False
)
return hvg_list

n_hvgs = par["hvgs"]
n_components = adata.obsm["X_pca"].shape[1]

if adata.n_vars < n_hvgs:
n_hvgs = adata.n_vars
# ----------- SCRIPT -----------
print(
f"====== Process dataset (scanpy v{sc.__version__}, scib v{scib.__version__}) ======",
flush=True,
)

if adata.n_vars > n_hvgs:
print(f"Select {par['hvgs']} highly variable genes", flush=True)
hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs)
print("\n>>> Loading config...", flush=True)
config = op.project.read_viash_config(meta["config"])

print("Subsetting to HVG dimensions", flush=True)
adata = adata[:, adata.var_names.isin(hvg_list)].copy()
print("\n>>> Reading input files...", flush=True)
print(f"Input H5AD file: '{par['input']}'", flush=True)
adata = ad.read_h5ad(par["input"])
print(adata, flush=True)

print(">> Recompute HVG", flush=True)
out = sc.pp.highly_variable_genes(
adata,
layer="normalized",
n_top_genes=n_hvgs,
flavor="cell_ranger",
inplace=False
)
adata.var["hvg"] = out["highly_variable"].values
adata.var["hvg_score"] = out["dispersions_norm"].values
n_hvgs = par["hvgs"]
if adata.n_vars < n_hvgs:
print(f"\n>>> Using all {adata.n_vars} features as HVGs...", flush=True)
n_hvgs = adata.n_vars
hvg_list = adata.var_names.tolist()
else:
print(f"\n>>> Computing {n_hvgs} batch-aware HVGs...", flush=True)
hvg_list = compute_batched_hvg(adata, n_hvgs=n_hvgs)

print(">> Recompute PCA", flush=True)
n_components = adata.obsm["X_pca"].shape[1]
print(f"\n>>> Computing PCA with {n_components} components using HVGs...", flush=True)
hvg_mask = adata.var_names.isin(hvg_list)
X_pca, loadings, variance, variance_ratio = sc.pp.pca(
adata.layers["normalized"],
adata.layers["normalized"],
n_comps=n_components,
return_info=True
mask_var=hvg_mask,
return_info=True,
)
adata.obsm["X_pca"] = X_pca
adata.varm["pca_loadings"] = loadings.T
adata.uns["pca_variance"] = {
"variance": variance,
"variance_ratio": variance_ratio
}
adata.varm["pca_loadings"] = np.zeros(shape=(adata.n_vars, n_components))
adata.varm["pca_loadings"][hvg_mask, :] = loadings.T
adata.uns["pca_variance"] = {"variance": variance, "variance_ratio": variance_ratio}

print(">> Recompute neighbors", flush=True)
print("\n>>> Computing neighbours using PCA...", flush=True)
del adata.uns["knn"]
del adata.obsp["knn_connectivities"]
del adata.obsp["knn_distances"]
sc.pp.neighbors(adata, use_rep="X_pca", n_neighbors=30, key_added="knn")

print(">> Create output object", flush=True)
output_dataset = subset_h5ad_by_format(
adata,
config,
"output_dataset"
print("\n>>> Recomputing standard HVG scores...", flush=True)
out = sc.pp.highly_variable_genes(
adata, layer="normalized", n_top_genes=n_hvgs, flavor="cell_ranger", inplace=False
)
output_solution = subset_h5ad_by_format(
adata,
config,
"output_solution"
adata.var["hvg"] = out["highly_variable"].values
adata.var["hvg_score"] = out["dispersions_norm"].values

print("\n>>> Creating dataset output object...", flush=True)
output_dataset = subset_h5ad_by_format(
adata[:, adata.var_names.isin(hvg_list)].copy(), config, "output_dataset"
)
print(output_dataset, flush=True)

print("Writing adatas to file", flush=True)
print("\n>>> Creating solution output object...", flush=True)
output_solution = subset_h5ad_by_format(adata, config, "output_solution")
print(output_solution, flush=True)

print("\n>>> Writing output to files...", flush=True)
print(f"Dataset H5AD file: '{par['output_dataset']}'", flush=True)
output_dataset.write_h5ad(par["output_dataset"], compression="gzip")
print(f"Solution H5AD file: '{par['output_solution']}'", flush=True)
output_solution.write_h5ad(par["output_solution"], compression="gzip")

print("\n>>> Done!", flush=True)
6 changes: 5 additions & 1 deletion src/metrics/hvg_overlap/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@
adata_integrated = adata_integrated[~adata_integrated.obs["batch"].isin(skip_batches)]

print("Compute score", flush=True)
score = hvg_overlap(adata_solution, adata_integrated, batch_key="batch")
score = hvg_overlap(
adata_solution[:, adata_solution.var_names.isin(adata_integrated.var_names)],
adata_integrated,
batch_key="batch"
)

print("Create output AnnData object", flush=True)
output = ad.AnnData(
Expand Down
5 changes: 3 additions & 2 deletions src/metrics/pcr/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
)
adata_integrated = read_anndata(
par['input_integrated'],
var='var',
obs='obs',
obsm='obsm',
uns='uns'
Expand All @@ -39,11 +40,11 @@

print('compute score', flush=True)
score = pcr_comparison(
adata_solution,
adata_solution[:, adata_solution.var_names.isin(adata_integrated.var_names)],
adata_integrated,
embed='X_emb',
covariate='batch',
verbose=False
verbose=True
)

print('Create output AnnData object', flush=True)
Expand Down

0 comments on commit 83da5db

Please sign in to comment.