Skip to content

Commit

Permalink
metrics all is added. readme updated
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Jan 11, 2025
1 parent 50c65b3 commit b3e2d67
Show file tree
Hide file tree
Showing 20 changed files with 522 additions and 306 deletions.
278 changes: 134 additions & 144 deletions runs.ipynb

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions scripts/calculate_score.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# bash src/metrics/all_metrics/run.sh resources/grn_models/norman/grnboost2.csv norman

prediction=${1}
dataset_id=${2}

viash run src/metrics/all_metrics/config.novsh.yaml -- \
--prediction ${prediction} \
--dataset_id ${dataset_id} \
--score output/score.h5ad \
--tf_all resources/prior/tf_all.csv \
--regulators_consensus resources/prior/regulators_consensus_${dataset_id}.json \
--ws_consensus resources/prior/ws_consensus_${dataset_id}.csv \
--ws_distance_background resources/prior/ws_distance_background_${dataset_id}.csv \
--evaluation_data_sc resources/evaluation_datasets/${dataset_id}_sc_counts.h5ad \
--evaluation_data resources/evaluation_datasets/${dataset_id}_perturbation.h5ad
26 changes: 8 additions & 18 deletions scripts/download_resources.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,33 +5,23 @@ set -e
echo ">> Downloading resources"

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources/grn/grn-benchmark" \
--output "resources/grn-benchmark" \
--input "s3://openproblems-data/resources/grn/inference_datasets/" \
--output "resources/inference_datasets/" \
--delete

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources/grn/prior" \
--output "resources/prior" \
--input "s3://openproblems-data/resources/grn/evaluation_datasets/" \
--output "resources/evaluation_datasets/" \
--delete

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources/grn/grn_models" \
--output "resources/grn_models" \
--delete
echo ">> Downloading resources test"
viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources_test/grn/grn-benchmark" \
--output "resources_test/grn-benchmark" \
--delete

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources_test/grn/prior" \
--output "resources_test/prior" \
--input "s3://openproblems-data/resources/grn/prior" \
--output "resources/prior" \
--delete

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources_test/grn/grn_models" \
--output "resources_tests/grn_models" \
--input "s3://openproblems-data/resources/grn/grn_models/" \
--output "resources/grn_models/" \
--delete


13 changes: 13 additions & 0 deletions scripts/download_resources_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

set -e

echo ">> Downloading resources"

viash run src/common/sync_test_resources/config.vsh.yaml -- \
--input "s3://openproblems-data/resources/grn/" \
--output "resources/" \
--delete



2 changes: 1 addition & 1 deletion scripts/render_readme.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
set -e

viash run src/common/create_task_readme/config.vsh.yaml -- \
--task "grn_benchmark" \
--task "grn_inference" \
--task_dir "src" \
--github_url "https://github.com/openproblems-bio/task_grn_inference/tree/main/" \
--output "README.md"
9 changes: 0 additions & 9 deletions scripts/upload_resources.sh

This file was deleted.

41 changes: 2 additions & 39 deletions src/api/comp_metric.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,48 +8,19 @@ functionality:
description: |
A metric to evaluate the performance of the inferred GRN
arguments:
- name: --evaluation_data
__merge__: file_evaluation_h5ad.yaml
required: false
direction: input
- name: --prediction
__merge__: file_prediction.yaml
required: true
direction: input
- name: --score
__merge__: file_score.yaml
required: false
direction: output
- name: --tf_all
type: file
direction: input
required: true
example: resources_test/prior/tf_all.csv
- name: --reg_type
type: string
direction: input
default: ridge
description: name of regression to use
multiple: false
- name: --subsample
type: integer
direction: input
default: -1
description: number of samples randomly drawn from perturbation data
- name: --num_workers
type: integer
direction: input
default: 4
direction: output
- name: --method_id
type: string
direction: input
required: false
example: collectri
- name: --apply_tf
type: boolean
required: false
default: true

example: grnboost2
- name: --layer
type: string
direction: input
Expand All @@ -62,14 +33,6 @@ functionality:
type: integer
default: 2
direction: input
- name: --skeleton
type: file
direction: input
example: resources_test/prior/skeleton.csv
- name: --apply_skeleton
type: boolean
direction: input
default: false
- name: --dataset_id
type: string
direction: input
Expand Down
42 changes: 42 additions & 0 deletions src/api/comp_metric_regression.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
__merge__: comp_metric.yaml
functionality:
name: metrics_regression
namespace: "metrics"
info:
label: metrics_regression
summary: Calculates regression scores
arguments:
- name: --evaluation_data
__merge__: file_evaluation_h5ad.yaml
required: false
direction: input
- name: --tf_all
type: file
direction: input
required: true
example: resources_test/prior/tf_all.csv
- name: --reg_type
type: string
direction: input
default: ridge
description: name of regression to use
multiple: false
- name: --subsample
type: integer
direction: input
default: -1
description: number of samples randomly drawn from perturbation data
- name: --num_workers
type: integer
direction: input
default: 4
- name: --apply_tf
type: boolean
required: false
default: true
- name: --apply_skeleton
type: boolean
required: false
default: false


26 changes: 26 additions & 0 deletions src/api/comp_metric_ws.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
__merge__: comp_metric.yaml
functionality:
name: ws_distance
namespace: "metrics"
info:
label: ws_distance
summary: Calculates Wasserstein distance for a given GRN and dataset
arguments:
- name: --ws_consensus
type: file
direction: input
must_exist: false
required: true
example: resources_test/prior/ws_consensus_norman.csv
- name: --ws_distance_background
type: file
direction: input
must_exist: false
required: true
example: resources_test/prior/ws_distance_background_norman.csv
- name: --evaluation_data_sc
type: file
required: true
direction: input
example: 'resources_test/datasets_raw/adamson_sc_counts.h5ad'

39 changes: 21 additions & 18 deletions src/api/task_info.yaml
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
name: GRN inference benchmark
label: A dynamic benchmark for gene regulatory network (GRN) inference
label: Living benchmark for gene regulatory network (GRN) inference
motivation: |
GRNs are essential for understanding cellular identity and behavior. They are simplified models of gene expression regulated by complex processes involving multiple layers of control, from transcription to post-transcriptional modifications, incorporating various regulatory elements and non-coding RNAs. Gene transcription is controlled by a regulatory complex that includes transcription factors (TFs), cis-regulatory elements (CREs) like promoters and enhancers, and essential co-factors. High-throughput datasets, covering thousands of genes, facilitate the use of machine learning approaches to decipher GRNs. The advent of single-cell sequencing technologies, such as scRNA-seq, has made it possible to infer GRNs from a single experiment due to the abundance of samples. This allows researchers to infer condition-specific GRNs, such as for different cell types or diseases, and study potential regulatory factors associated with these conditions. Combining chromatin accessibility data with gene expression measurements has led to the development of enhancer-driven GRN (eGRN) inference pipelines, which offer significantly improved accuracy over single-modality methods.
description: |
Here, we present a dynamic benchmark platform for GRN inference. This platform provides curated datasets for GRN inference and evaluation, standardized evaluation protocols and metrics, computational infrastructure, and a dynamically updated leaderboard to track state-of-the-art methods. It runs novel GRNs in the cloud, offers competition scores, and stores them for future comparisons, reflecting new developments over time.
Here, we present geneRNIB as a living benchmark platform for GRN inference. This platform provides curated datasets for GRN inference and evaluation, standardized evaluation protocols and metrics, computational infrastructure, and a dynamically updated leaderboard to track state-of-the-art methods. It runs novel GRNs in the cloud, offers competition scores, and stores them for future comparisons, reflecting new developments over time.
The platform supports the integration of new datasets and protocols. When a new feature is added, previously evaluated GRNs are re-assessed, and the leaderboard is updated accordingly. The aim is to evaluate both the accuracy and completeness of inferred GRNs. It is designed for both single-modality and multi-omics GRN inference. Ultimately, it is a community-driven platform. So far, six eGRN inference methods have been integrated: Scenic+, CellOracle, FigR, scGLUE, GRaNIE, and ANANSE.
The platform supports the integration of new datasets and protocols. When a new feature is added, previously evaluated GRNs are re-assessed, and the leaderboard is updated accordingly. The aim is to evaluate both the accuracy and completeness of inferred GRNs. It is designed for both single-modality and multi-omics GRN inference. Ultimately, it is a community-driven platform.
So far, ten GRN inference methods have been integrated: five single-omics methods of GRNBoost2, GENIE3, Portia, PPCOR, and Scenic; and five eGRN inference methods of Scenic+, CellOracle, FigR, scGLUE, and GRaNIE.
Due to its flexible nature, the platform can incorporate various benchmark datasets and evaluation methods, using either prior knowledge or feature-based approaches.
In the current version, due to the absence of standardized prior knowledge, we use indirect approaches to benchmark GRNs. Employing interventional data as evaluation datasets, we have developed 8 metrics using a feature-based approach and Wasserstein distance, accounting for both accuracy and comprehensiveness.
Five datasets have been integrated so far, namely OPSCA, Nakatake, Norman, Adamson, and Replogle. For each dataset, standardized inference datasets are provided to be used for GRN inference and evaluation datasets are employed to benchmark.
See our publication for the details of methods.
Due to its flexible nature, the platform can incorporate various benchmark datasets and evaluation methods, using either prior knowledge or feature-based approaches. In the current version, due to the absence of standardized prior knowledge, we use a feature-based approach to benchmark GRNs. Our evaluation utilizes standardized datasets for GRN inference and evaluation, employing multiple regression analysis approaches to assess both accuracy and comprehensiveness.
summary: |
Benchmarking GRN inference methods
Expand All @@ -28,21 +35,16 @@ readme: |
# download resources
scripts/download_resources.sh
```
The datasets for GRN inference are located in `resources/inference_datasets`.
## Infer a GRN
```bash
viash run src/methods/dummy/config.vsh.yaml -- --multiomics_rna resources/grn-benchmark/multiomics_rna.h5ad --multiomics_atac resources/grn-benchmark/multiomics_atac.h5ad --prediction output/dummy.csv
```
Similarly, run the command for other methods.
One GRN should be inferred for each inference dataset (op, norman, replogle2, adamson, and nakatake). The inferred GRN should have three columns of `source, target, weight`. See `resources/grn_models/op/grnboost2.csv` as an example.
## Evaluate a GRN
Once a GRN is inferred (e.g. located in `output/your_GRN.csv`) for a given dataset (e.g. `norman`), use the following code to obtain evaluation scores.
```bash
scripts/benchmark_grn.sh --grn resources/grn-benchmark/models/collectri.csv
scripts/calculate_score.sh output/your_GRN.csv norman
```
Similarly, run the command for other GRN models.
This will calculate and print the scores as well as output the scores into `output/score.h5ad`
## Add a method
Expand All @@ -64,14 +66,15 @@ authors:
roles: [ contributor ]
info:
github: AntoinePassemiers
- name: Christian Arnold
roles: [ contributor ]
info:
github: chrarnold
- name: Marco Stock
roles: [ contributor ]
info:
github: stkmrc
- name: Christian Arnold
roles: [ contributor ]
info:
github: chrarnold




97 changes: 97 additions & 0 deletions src/metrics/all_metrics/config.novsh.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@

__merge__: ../../api/comp_metric.yaml

functionality:
name: metrics_all
info:
label: metrics_all
summary: Calculates all metrics for a given GRN and dataset
arguments:
- name: --evaluation_data
type: file
required: true
direction: input
- name: --tf_all
type: file
direction: input
required: true
example: resources_test/prior/tf_all.csv
- name: --reg_type
type: string
direction: input
default: ridge
description: name of regression to use
multiple: false
- name: --subsample
type: integer
direction: input
default: -1
description: number of samples randomly drawn from perturbation data
- name: --num_workers
type: integer
direction: input
default: 4
- name: --apply_tf
type: boolean
required: false
default: true
- name: --apply_skeleton
type: boolean
required: false
default: false
- name: --regulators_consensus
type: file
direction: input
must_exist: false
required: true
example: resources_test/prior/regulators_consensus_norman.json
- name: --static_only
direction: input
type: boolean
default: true
- name: --binarize
type: boolean
direction: input
description: whether to binarize the weight
default: true
- name: --ws_consensus
type: file
direction: input
must_exist: false
required: true
example: resources_test/prior/ws_consensus_norman.csv
- name: --ws_distance_background
type: file
direction: input
must_exist: false
required: true
example: resources_test/prior/ws_distance_background_norman.csv
- name: --evaluation_data_sc
type: file
required: true
direction: input
example: 'resources_test/datasets_raw/adamson_sc_counts.h5ad'


resources:
- type: python_script
path: script.py
- path: /src/utils/util.py
dest: util.py
- path: /src/metrics/regression_1/main.py
dest: reg1_main.py
- path: /src/metrics/regression_2/main.py
dest: reg2_main.py
- path: /src/metrics/wasserstein/main.py
dest: ws_main.py


platforms:
- type: docker
image: ghcr.io/openproblems-bio/base_python:1.0.4
setup:
- type: python
packages: [ lightgbm==4.3.0, numpy==1.26.4 ]
- type: nextflow
directives:
label: [ midtime, midmem, midcpu ]
Loading

0 comments on commit b3e2d67

Please sign in to comment.