Skip to content

Commit

Permalink
benchmark all added
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Sep 18, 2024
1 parent d72e5b1 commit f95029d
Show file tree
Hide file tree
Showing 19 changed files with 384 additions and 37 deletions.
41 changes: 22 additions & 19 deletions scripts/run_benchmark_single_omics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,19 @@

# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
RUN_ID="single_omics_inference"
# resources_dir="./resources_test/"
resources_dir="s3://openproblems-data/resources/grn"
resources_dir="./resources_test/"
# resources_dir="s3://openproblems-data/resources/grn"
publish_dir="${resources_dir}/results/${RUN_ID}"


reg_type=ridge
subsample=-2
max_workers=10
layer='scgen_pearson'
metric_ids="[regression_1, regression_2]"
metric_ids="[regression_1]"
cell_type_specific=true #for controls
# method_ids="[tigress, ennet, scsgl, pidc]"
method_ids="[scenic]"
method_ids="[pearson_corr, pearson_causal, positive_control]"

param_file="./params/${RUN_ID}.yaml"

Expand All @@ -24,24 +25,26 @@ param_list:
metric_ids: $metric_ids
method_ids: $method_ids
perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna_0.h5ad
multiomics_rna: ${resources_dir}/grn-benchmark/multiomics_rna.h5ad
multiomics_atac: ${resources_dir}/grn-benchmark/multiomics_atac.h5ad
reg_type: $reg_type
subsample: $subsample
max_workers: $max_workers
layer: $layer
consensus: ${resources_dir}/prior/consensus-num-regulators.json
tf_all: ${resources_dir}/prior/tf_all.csv
cell_type_specific: ${cell_type_specific}
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

# nextflow run . \
# -main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \
# -profile docker \
# -with-trace \
# -c src/common/nextflow_helpers/labels_ci.config \
# -params-file ${param_file}
nextflow run . \
-main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \
-profile docker \
-with-trace \
-c src/common/nextflow_helpers/labels_ci.config \
-params-file ${param_file}

# ./tw-windows-x86_64.exe launch `
# https://github.com/openproblems-bio/task_grn_inference.git `
Expand All @@ -53,11 +56,11 @@ HERE
# --params-file ./params/single_omics_inference.yaml `
# --config src/common/nextflow_helpers/labels_tw.config

./tw launch https://github.com/openproblems-bio/task_grn_inference \
--revision build/main \
--pull-latest \
--main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file ${param_file} \
--config src/common/nextflow_helpers/labels_tw.config
# ./tw launch https://github.com/openproblems-bio/task_grn_inference \
# --revision build/main \
# --pull-latest \
# --main-script target/nextflow/workflows/run_benchmark_single_omics/main.nf \
# --workspace 53907369739130 \
# --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
# --params-file ${param_file} \
# --config src/common/nextflow_helpers/labels_tw.config
3 changes: 2 additions & 1 deletion src/api/comp_method.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@ functionality:
- name: --cell_type_specific
type: boolean
direction: input
default: false
default: true




Expand Down
1 change: 0 additions & 1 deletion src/control_methods/baseline_corr/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ functionality:
label: baseline_corr
summary: "Baseline based on correlation"
arguments:

- name: --causal
type: boolean
direction: input
Expand Down
7 changes: 5 additions & 2 deletions src/control_methods/baseline_corr/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ def corr_net(X, gene_names, par):
return net

def create_corr_net(X, gene_names, groups, par):
if par['cell_type_specific']:
# if par['cell_type_specific']:
if True:
i = 0
for group in tqdm(np.unique(groups), desc="Processing groups"):
X_sub = X[groups == group, :]
Expand Down Expand Up @@ -96,7 +97,9 @@ def create_meta_cells(df, n_cells=15):
tf_all = np.intersect1d(tf_all, gene_names)

print('Noramlize data')
multiomics_rna.X = multiomics_rna.layers['lognorm']
sc.pp.normalize_total(multiomics_rna)
sc.pp.log1p(multiomics_rna)
sc.pp.scale(multiomics_rna)

if par['impute']:
print("imputing")
Expand Down
24 changes: 24 additions & 0 deletions src/control_methods/pearson/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Viash component definition for the pearson_corr control method.
# Shared arguments are pulled in from the common method API file.
__merge__: ../../api/comp_method.yaml

functionality:
  name: pearson_corr
  namespace: control_methods
  info:
    label: pearson_corr
    summary: "Baseline based on correlation"

  resources:
    - type: python_script
      path: script.py

platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
    setup:
      - type: python
        # No extra Python packages are installed; magic-impute is left disabled.
        # packages: [ magic-impute ]
        packages: []
  - type: native
  - type: nextflow
    directives:
      label:
        - midtime
        - midmem
        - midcpu
76 changes: 76 additions & 0 deletions src/control_methods/pearson/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
import os
import pandas as pd
import numpy as np
import anndata as ad
import scanpy as sc
from tqdm import tqdm
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

# Development defaults used when this script is run directly.
# NOTE(review): the VIASH START/END markers presumably delimit the block that
# viash replaces with the real component arguments at build time — confirm
# against the viash documentation before editing the markers.
## VIASH START
par = {
# NOTE(review): the accompanying test.sh passes multiomics_rna.h5ad (no "_0"
# suffix) — confirm which file actually exists.
'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad',
'tf_all': 'resources/prior/tf_all.csv',
# 'causal' is not read anywhere in this script.
'causal': False,
'cell_type_specific': True,
# Upper bound on the number of links kept per call to process_links().
'max_n_links': 50000,
'prediction': 'resources/grn_models/donor_0_default/pearson.csv',
# Used as random_state when sampling source columns in corr_net().
"seed": 32
}
## VIASH END
# Echo the effective parameters.
print(par)

def process_links(net, par):
    """Drop self-loops and keep the strongest links by absolute weight.

    Args:
        net: DataFrame with ``source``, ``target`` and ``weight`` columns.
        par: Parameter dict; ``par['max_n_links']`` caps the number of rows kept.

    Returns:
        DataFrame ordered by descending ``|weight|``, truncated to at most
        ``max_n_links`` rows, with a fresh 0..n-1 index.
    """
    without_self_loops = net.loc[net['source'] != net['target']]
    # Row labels ordered from the largest to the smallest absolute weight.
    ranked_labels = (
        without_self_loops['weight']
        .abs()
        .sort_values(ascending=False)
        .index
    )
    strongest = without_self_loops.reindex(ranked_labels).iloc[:par['max_n_links']]
    return strongest.reset_index(drop=True)

def corr_net(X, gene_names, par):
    """Build a correlation-based link table with randomly sampled source genes.

    The columns of ``X`` are standardized and the gene-by-gene matrix
    ``X^T X / n_rows`` is computed. ``len(tf_all)`` columns of that matrix are
    then drawn at random (seeded by ``par['seed']``) to act as sources; the
    row labels act as targets.

    Args:
        X: Expression matrix whose columns correspond to ``gene_names``.
        gene_names: Labels used for both axes of the correlation matrix.
        par: Parameter dict; reads ``seed`` here and is passed on to
            ``process_links``.

    Returns:
        Long-format DataFrame with ``target``, ``source`` and ``weight``
        columns, filtered by ``process_links``.

    Note:
        Relies on the module-level ``tf_all`` for the number of columns to sample.
    """
    X_std = StandardScaler().fit_transform(X)
    n_rows = X_std.shape[0]
    corr = pd.DataFrame(
        np.dot(X_std.T, X_std) / n_rows,
        index=gene_names,
        columns=gene_names,
    )

    # Random subset of columns, same size as the TF list, used as "sources".
    sampled = corr.sample(len(tf_all), axis=1, random_state=par['seed']).reset_index()
    # After reset_index the former row labels live in the first column.
    target_col = sampled.columns[0]

    links = sampled.melt(id_vars=target_col, var_name='source', value_name='weight')
    links = links.rename(columns={target_col: 'target'})
    return process_links(links, par)

def create_corr_net(X, gene_names, groups, par):
    """Build the correlation network, optionally one network per group.

    Args:
        X: Expression matrix; rows are matched one-to-one with ``groups`` and
            columns with ``gene_names``.
        gene_names: Gene labels for the columns of ``X``.
        groups: Per-row group label (the caller passes the ``cell_type``
            column of ``obs``).
        par: Parameter dict; reads ``cell_type_specific`` here and is passed
            on to ``corr_net``.

    Returns:
        DataFrame of links as produced by ``corr_net``. When
        ``par['cell_type_specific']`` is truthy, the per-group networks are
        stacked under a fresh 0..n-1 index and carry an extra ``cell_type``
        column holding the group label.

    Raises:
        ValueError: if ``cell_type_specific`` is set but ``groups`` is empty.
    """
    if not par['cell_type_specific']:
        return corr_net(X, gene_names, par)

    per_group = []
    for group in tqdm(np.unique(groups), desc="Processing groups"):
        # Plain boolean ndarray so row selection does not depend on how the
        # container type of X handles a pandas Series mask.
        mask = np.asarray(groups == group)
        net = corr_net(X[mask, :], gene_names, par)
        net['cell_type'] = group
        per_group.append(net)

    if not per_group:
        raise ValueError("cell_type_specific is set but no groups were provided")

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(per_group, axis=0, ignore_index=True)
# Script body: load inputs, normalize, build the correlation network, write it out.
print('Read data')
multiomics_rna = ad.read_h5ad(par["multiomics_rna"])


gene_names = multiomics_rna.var_names.to_numpy()
# tf_all is a module-level global; corr_net() uses its length to decide how
# many source columns to sample.
tf_all = np.loadtxt(par['tf_all'], dtype=str)
# Per-cell group labels used for the cell-type-specific networks.
groups = multiomics_rna.obs.cell_type
# Keep only the TFs that are present among this dataset's genes.
tf_all = np.intersect1d(tf_all, gene_names)

print('Noramlize data')
# Total-count normalization, log1p, then scaling, applied in this order;
# the result is read back from multiomics_rna.X below.
sc.pp.normalize_total(multiomics_rna)
sc.pp.log1p(multiomics_rna)
sc.pp.scale(multiomics_rna)

print('Create corr net')
net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par)

print('Output GRN')
# NOTE(review): to_csv is called without index=False, so the row index is
# written as the first column — confirm downstream readers expect that.
net.to_csv(par['prediction'])
5 changes: 5 additions & 0 deletions src/control_methods/pearson/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Smoke test for the pearson_corr control method: run this component's own
# viash config on the benchmark data and write the predicted network.
# --causal is not passed: this component's config declares no such argument
# and its script never reads par['causal'].
viash run src/control_methods/pearson/config.vsh.yaml -- \
  --prediction output/pearson_corr.csv \
  --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
  --tf_all resources/prior/tf_all.csv
24 changes: 24 additions & 0 deletions src/control_methods/pearson_causal/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Viash component definition for the pearson_causal control method.
# Shared arguments are pulled in from the common method API file.
__merge__: ../../api/comp_method.yaml

functionality:
  name: pearson_causal
  namespace: control_methods
  info:
    label: pearson_causal
    # Made specific so this control can be told apart from pearson_corr:
    # its script keeps only the tf_all columns as sources.
    summary: "Baseline based on Pearson correlation with sources restricted to known TFs"

  resources:
    - type: python_script
      path: script.py

platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
    setup:
      - type: python
        # No extra Python packages are installed; magic-impute is left disabled.
        # packages: [ magic-impute ]
        packages: []
  - type: native
  - type: nextflow
    directives:
      label: [midtime, midmem, midcpu]
77 changes: 77 additions & 0 deletions src/control_methods/pearson_causal/script.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import pandas as pd
import numpy as np
import anndata as ad
import scanpy as sc
from tqdm import tqdm
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

# Development defaults used when this script is run directly.
# NOTE(review): the VIASH START/END markers presumably delimit the block that
# viash replaces with the real component arguments at build time — confirm
# against the viash documentation before editing the markers.
## VIASH START
par = {
# NOTE(review): the accompanying test.sh passes multiomics_rna.h5ad (no "_0"
# suffix) — confirm which file actually exists.
'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad',
'tf_all': 'resources/prior/tf_all.csv',
# 'causal' is not read anywhere in this script; corr_net() always restricts
# sources to tf_all.
'causal': True,
'cell_type_specific': True,
# Upper bound on the number of links kept per call to process_links().
'max_n_links': 50000,
'prediction': 'resources/grn_models/donor_0_default/pearson_causal.csv',
# 'seed' is not read anywhere in this script.
"seed": 32
}
## VIASH END
# Echo the effective parameters.
print(par)

def process_links(net, par):
    """Remove self-loops, rank links by ``|weight|`` and keep the top ones.

    Args:
        net: DataFrame with ``source``, ``target`` and ``weight`` columns.
        par: Parameter dict; ``par['max_n_links']`` is the maximum number of
            rows returned.

    Returns:
        The retained links, strongest first, re-indexed from 0.
    """
    is_distinct_pair = net['source'] != net['target']
    candidates = net[is_distinct_pair]
    # Absolute weights sorted descending; only the resulting label order is used.
    by_strength = candidates['weight'].abs().sort_values(ascending=False)
    top_links = candidates.reindex(by_strength.index).head(par['max_n_links'])
    return top_links.reset_index(drop=True)

def corr_net(X, gene_names, par):
    """Build a correlation-based link table with TFs as the only sources.

    The columns of ``X`` are standardized and the gene-by-gene matrix
    ``X^T X / n_rows`` is computed. Only the columns listed in the
    module-level ``tf_all`` are kept as sources; the row labels act as targets.

    Args:
        X: Expression matrix whose columns correspond to ``gene_names``.
        gene_names: Labels used for both axes of the correlation matrix.
        par: Parameter dict, passed on to ``process_links``.

    Returns:
        Long-format DataFrame with ``target``, ``source`` and ``weight``
        columns, filtered by ``process_links``.
    """
    X_std = StandardScaler().fit_transform(X)
    n_rows = X_std.shape[0]
    corr = pd.DataFrame(
        np.dot(X_std.T, X_std) / n_rows,
        index=gene_names,
        columns=gene_names,
    )

    # Keep only TF columns; every remaining column is a candidate regulator.
    tf_columns = corr[tf_all].reset_index()
    # After reset_index the former row labels live in the first column.
    target_col = tf_columns.columns[0]

    links = tf_columns.melt(id_vars=target_col, var_name='source', value_name='weight')
    links = links.rename(columns={target_col: 'target'})
    return process_links(links, par)

def create_corr_net(X, gene_names, groups, par):
    """Build the correlation network, optionally one network per group.

    Args:
        X: Expression matrix; rows are matched one-to-one with ``groups`` and
            columns with ``gene_names``.
        gene_names: Gene labels for the columns of ``X``.
        groups: Per-row group label (the caller passes the ``cell_type``
            column of ``obs``).
        par: Parameter dict; reads ``cell_type_specific`` here and is passed
            on to ``corr_net``.

    Returns:
        DataFrame of links as produced by ``corr_net``. When
        ``par['cell_type_specific']`` is truthy, the per-group networks are
        stacked under a fresh 0..n-1 index and carry an extra ``cell_type``
        column holding the group label.

    Raises:
        ValueError: if ``cell_type_specific`` is set but ``groups`` is empty.
    """
    if not par['cell_type_specific']:
        return corr_net(X, gene_names, par)

    per_group = []
    for group in tqdm(np.unique(groups), desc="Processing groups"):
        # Plain boolean ndarray so row selection does not depend on how the
        # container type of X handles a pandas Series mask.
        mask = np.asarray(groups == group)
        net = corr_net(X[mask, :], gene_names, par)
        net['cell_type'] = group
        per_group.append(net)

    if not per_group:
        raise ValueError("cell_type_specific is set but no groups were provided")

    # Concatenate once instead of growing the frame inside the loop.
    return pd.concat(per_group, axis=0, ignore_index=True)
# Script body: load inputs, normalize, build the correlation network, write it out.
print('Read data')
multiomics_rna = ad.read_h5ad(par["multiomics_rna"])


gene_names = multiomics_rna.var_names.to_numpy()
# tf_all is a module-level global; corr_net() uses it to select the source columns.
tf_all = np.loadtxt(par['tf_all'], dtype=str)
# Per-cell group labels used for the cell-type-specific networks.
groups = multiomics_rna.obs.cell_type
# Keep only the TFs that are present among this dataset's genes, so the
# column selection in corr_net() only names existing columns.
tf_all = np.intersect1d(tf_all, gene_names)

print('Noramlize data')
# Total-count normalization, log1p, then scaling, applied in this order;
# the result is read back from multiomics_rna.X below.
sc.pp.normalize_total(multiomics_rna)
sc.pp.log1p(multiomics_rna)
sc.pp.scale(multiomics_rna)

print('Create corr net')
net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par)

print('Output GRN')
# NOTE(review): to_csv is called without index=False, so the row index is
# written as the first column — confirm downstream readers expect that.
net.to_csv(par['prediction'])
5 changes: 5 additions & 0 deletions src/control_methods/pearson_causal/test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Smoke test for the pearson_causal control method: run this component's own
# viash config on the benchmark data and write the predicted network.
# --causal is not passed: this component's config declares no such argument
# and its script never reads par['causal'].
viash run src/control_methods/pearson_causal/config.vsh.yaml -- \
  --prediction output/pearson_causal.csv \
  --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad \
  --tf_all resources/prior/tf_all.csv
29 changes: 29 additions & 0 deletions src/control_methods/positive_control/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Viash component definition for the positive_control control method.
# Shared arguments are pulled in from the common method API file.
__merge__: ../../api/comp_method.yaml

functionality:
  name: positive_control
  namespace: control_methods
  info:
    label: positive_control
    # NOTE(review): this summary is identical to the pearson controls' summary,
    # while this component additionally takes perturbation data as input —
    # confirm the wording against the component's script.
    summary: "Baseline based on correlation"
  arguments:
    # Perturbation dataset; required, on top of the arguments merged from the API file.
    - name: --perturbation_data
      type: file
      required: true
      direction: input
      example: resources_test/grn-benchmark/perturbation_data.h5ad
  resources:
    - type: python_script
      path: script.py

platforms:
  - type: docker
    image: ghcr.io/openproblems-bio/base_python:1.0.4
    setup:
      - type: python
        # No extra Python packages are installed; magic-impute is left disabled.
        # packages: [ magic-impute ]
        packages: []
  - type: native
  - type: nextflow
    directives:
      label:
        - midtime
        - midmem
        - midcpu
Loading

0 comments on commit f95029d

Please sign in to comment.