From 2ed4898a445f60fcb4b4db15ee9bd4ad84f0cc2a Mon Sep 17 00:00:00 2001
From: Robrecht Cannoodt <rcannood@gmail.com>
Date: Thu, 31 Oct 2024 09:04:23 +0100
Subject: [PATCH] update results

---
 .../data/dataset_info.json                    |   9 +-
 .../data/method_info.json                     |  64 +-
 .../data/metric_execution_info.json           | 160 +--
 .../data/metric_info.json                     |  40 +-
 .../data/quality_control.json                 | 986 +++++++++---------
 .../perturbation_prediction/data/results.json | 292 +++---
 .../data/task_info.json                       |  88 +-
 7 files changed, 859 insertions(+), 780 deletions(-)

diff --git a/results/perturbation_prediction/data/dataset_info.json b/results/perturbation_prediction/data/dataset_info.json
index 85222a91..d9ff0bb2 100644
--- a/results/perturbation_prediction/data/dataset_info.json
+++ b/results/perturbation_prediction/data/dataset_info.json
@@ -1,13 +1,12 @@
 [
   {
-    "task_id": "perturbation_prediction",
     "dataset_id": "neurips-2023-data",
     "dataset_name": "NeurIPS2023 scPerturb DGE",
     "dataset_summary": "Differential gene expression sign(logFC) * -log10(p-value) values after 24 hours of treatment with 144 compounds in human PBMCs",
     "dataset_description": "For this competition, we designed and generated a novel single-cell perturbational dataset in human peripheral blood mononuclear cells (PBMCs). We selected 144 compounds from the Library of Integrated Network-Based Cellular Signatures (LINCS) Connectivity Map dataset (PMID: 29195078) and measured single-cell gene expression profiles after 24 hours of treatment. The experiment was repeated in three healthy human donors, and the compounds were selected based on diverse transcriptional signatures observed in CD34+ hematopoietic stem cells (data not released). We performed this experiment in human PBMCs because the cells are commercially available with pre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue that contains multiple mature cell types (including T-cells, B-cells, myeloid cells, and NK cells) with established markers for annotation of cell types. To supplement this dataset, we also measured cells from each donor at baseline with joint scRNA and single-cell chromatin accessibility measurements using the 10x Multiome assay. We hope that the addition of rich multi-omic data for each donor and cell type at baseline will help establish biological priors that explain the susceptibility of particular genes to exhibit perturbation responses in difference biological contexts.",
-    "data_reference": "TBD",
-    "data_url": "TBD",
-    "date_created": "02-06-2024",
-    "file_size": 183170148
+    "data_reference": "@inproceedings{slazata2024benchmark,\n\ttitle = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n\tauthor = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n\tbooktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n\tyear = {2024},\n\turl = {https://openreview.net/forum?id=WTI4RJYSVm}\n}",
+    "data_url": "https://trace.ncbi.nlm.nih.gov/Traces/?view=study&acc=SRP527159",
+    "date_created": "31-10-2024",
+    "file_size": 183168750
   }
 ]
diff --git a/results/perturbation_prediction/data/method_info.json b/results/perturbation_prediction/data/method_info.json
index d5f21d9f..8ef7afd8 100644
--- a/results/perturbation_prediction/data/method_info.json
+++ b/results/perturbation_prediction/data/method_info.json
@@ -8,9 +8,9 @@
     "is_baseline": true,
     "paper_reference": null,
     "code_url": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/ground_truth/config.vsh.yaml",
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/ground_truth/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "control_methods",
@@ -21,9 +21,9 @@
     "is_baseline": true,
     "paper_reference": null,
     "code_url": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/mean_outcome/config.vsh.yaml",
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/mean_outcome/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "control_methods",
@@ -34,9 +34,9 @@
     "is_baseline": true,
     "paper_reference": null,
     "code_url": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/mean_across_celltypes/config.vsh.yaml",
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/mean_across_celltypes/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "control_methods",
@@ -47,9 +47,9 @@
     "is_baseline": true,
     "paper_reference": null,
     "code_url": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/mean_across_compounds/config.vsh.yaml",
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/mean_across_compounds/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "control_methods",
@@ -60,9 +60,9 @@
     "is_baseline": true,
     "paper_reference": null,
     "code_url": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/sample/config.vsh.yaml",
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/sample/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "control_methods",
@@ -73,9 +73,9 @@
     "is_baseline": true,
     "paper_reference": null,
     "code_url": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/zeros/config.vsh.yaml",
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/zeros/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "methods",
@@ -85,10 +85,10 @@
     "method_description": "An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings,\none-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression.\nThe models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their\nrobustness and predictive performance. The approach also included data augmentation techniques to ensure\ngeneralization and account for noise in the data.\n",
     "is_baseline": false,
     "paper_reference": null,
-    "code_url": "https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main",
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/lgc_ensemble/config.vsh.yaml",
+    "code_url": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/lgc_ensemble/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "methods",
@@ -98,23 +98,23 @@
     "method_description": "The prediction system is two staged, so I publish two versions of the notebook.\nThe first stage predicts pseudolabels. To be honest, if I stopped on this version, I would not be the third.\nThe predicted pseudolabels on all test data (255 rows) are added to training in the second stage.\n\n**Stage 1 preparing pseudolabels**: The main part of this system is a neural network. Every neural network and its environment was optimized by optuna. Hyperparameters that have been optimized:\na dropout value, a number of neurons in particular layers, an output dimension of an embedding layer, a number of epochs, a learning rate, a batch size, a number of dimension of truncated singular value decomposition.\nThe optimization was done on custom 4-folds cross validation. In order to avoid overfitting to cross validation by optuna I applied 2 repeats for every fold and took an average. Generally, the more, the better. The optuna's criterion was MRRMSE.\nFinally, 7 models were ensembled. Optuna was applied again to determine best weights of linear combination. The prediction of test set is the pseudolabels now and will be used in second stage.\n\n**Stage 2 retraining with pseudolabels**: The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with optimized parameters in different experiments for a model diversity.\nOptuna selected optimal weights for the linear combination of the prediction again.\nModels had high variance, so every model was trained 10 times on all dataset and the median of prediction is taken as a final prediction. The prediction was additionally clipped to colwise min and max. \n",
     "is_baseline": false,
     "paper_reference": null,
-    "code_url": "https://github.com/okon2000/single_cell_perturbations",
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/nn_retraining_with_pseudolabels/config.vsh.yaml",
+    "code_url": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "methods",
     "method_id": "jn_ap_op2",
     "method_name": "JN-AP-OP2",
     "method_summary": "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP",
-    "method_description": "We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,\nwhere n_encode is 2. Then, X is passed to a MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs the same dimension data.\nThe purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (original encoded data) and feed it\nto MLP2 which receives n_smaples*n_genes, (n_encode + n_encode) and results n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)\ncombination. This is to overcome the underdetermination problem due to lack of sufficient (compound, cell_type) samples. \n",
+    "method_description": "We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,\nwhere n_encode is 2. Then, X is passed to a MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs the same dimension data.\nThe purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (original encoded data) and feed it\nto MLP2 which receives n_samples*n_genes, (n_encode + n_encode) and outputs n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)\ncombination. This is to overcome the underdetermination problem due to lack of sufficient (compound, cell_type) samples.\n",
     "is_baseline": false,
     "paper_reference": null,
-    "code_url": "https://github.com/AntoinePassemiers/Open-Challenges-Single-Cell-Perturbations",
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/jn_ap_op2/config.vsh.yaml",
+    "code_url": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/jn_ap_op2/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "methods",
@@ -123,11 +123,11 @@
     "method_summary": "Neural network model for drug effect prediction",
     "method_description": "ScAPE is utilises a neural network (NN) model to estimate drug effects on gene expression in\nperipheral blood mononuclear cells (PBMCs). The model took drug and cell features as input,\nwith these features primarily derived from the median of signed log-pvalues and log fold-changes\ngrouped by drug and cell type. The NN was trained using a leave-one-drug-out cross-validation\nstrategy, focusing on NK cells as a representative cell type due to their similarity to B cells\nand Myeloid cells in principal component analysis. Model performance was evaluated by comparing\nits predictions against two baselines: predicting zero effect and predicting the median\nlog-pvalue for each drug. The final submission combined predictions from models trained on\ndifferent gene and drug subsets, aiming to enhance overall prediction accuracy.\n",
     "is_baseline": false,
-    "paper_reference": "pablormier2023scape",
-    "code_url": "https://github.com/scapeML/scape",
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/scape/config.vsh.yaml",
+    "paper_reference": null,
+    "code_url": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/scape/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "methods",
@@ -137,10 +137,10 @@
     "method_description": "This method employs an ensemble of four transformer models,\neach with different weights and trained on slightly varying feature sets.\nThe feature engineering process involved one-hot encoding of categorical labels,\ntarget encoding using mean and standard deviation, and enriching the feature set\nwith the standard deviation of target variables. Additionally, the dataset was\ncarefully examined to ensure data cleanliness. A sophisticated sampling strategy\nbased on K-Means clustering was employed to partition the data into training and\nvalidation sets, ensuring a representative distribution. The model architecture\nleveraged sparse and dense feature encoding, along with a transformer for effective\nlearning.\n",
     "is_baseline": false,
     "paper_reference": null,
-    "code_url": "https://github.com/Eliorkalfon/single_cell_pb",
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/transformer_ensemble/config.vsh.yaml",
+    "code_url": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/transformer_ensemble/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   },
   {
     "task_id": "methods",
@@ -150,9 +150,9 @@
     "method_description": "An ensemble of four models was considered: \n\n* Py-boost (a ridge regression-based recommender system)\n* ExtraTrees (a decision tree ensemble with target-encoded features)\n* a k-nearest neighbors recommender system\n* a ridge regression model\n\nEach model offered distinct strengths and weaknesses: ExtraTrees and\nknn were unable to extrapolate beyond the training data, while ridge\nregression provided extrapolation capability. To enhance model performance,\ndata augmentation techniques were used, including averaging differential\nexpressions for compound mixtures and adjusting cell counts to reduce biases.\n\nIn the end, only the py-boost model is used for generating predictions.\n",
     "is_baseline": false,
     "paper_reference": null,
-    "code_url": "https://github.com/Ambros-M/Single-Cell-Perturbations-2023",
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/pyboost/config.vsh.yaml",
+    "code_url": null,
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/pyboost/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
   }
 ]
diff --git a/results/perturbation_prediction/data/metric_execution_info.json b/results/perturbation_prediction/data/metric_execution_info.json
index 8036a481..76773f81 100644
--- a/results/perturbation_prediction/data/metric_execution_info.json
+++ b/results/perturbation_prediction/data/metric_execution_info.json
@@ -6,10 +6,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.3,
-      "cpu_pct": 248.2,
+      "duration_sec": 7.6,
+      "cpu_pct": 226.9,
       "peak_memory_mb": 3482,
-      "disk_read_mb": 139,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -20,10 +20,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.6,
-      "cpu_pct": 284.6,
-      "peak_memory_mb": 4199,
-      "disk_read_mb": 136,
+      "duration_sec": 5.9,
+      "cpu_pct": 377.6,
+      "peak_memory_mb": 6144,
+      "disk_read_mb": 135,
       "disk_write_mb": 1
     }
   },
@@ -34,9 +34,9 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.8,
-      "cpu_pct": 245.5,
-      "peak_memory_mb": 4199,
+      "duration_sec": 4.8,
+      "cpu_pct": 450,
+      "peak_memory_mb": 6144,
       "disk_read_mb": 138,
       "disk_write_mb": 1
     }
@@ -48,9 +48,9 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.2,
-      "cpu_pct": 316.5,
-      "peak_memory_mb": 4813,
+      "duration_sec": 5.8,
+      "cpu_pct": 368.8,
+      "peak_memory_mb": 6144,
       "disk_read_mb": 134,
       "disk_write_mb": 1
     }
@@ -62,9 +62,9 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.6,
-      "cpu_pct": 236.2,
-      "peak_memory_mb": 3482,
+      "duration_sec": 5,
+      "cpu_pct": 439.6,
+      "peak_memory_mb": 6144,
       "disk_read_mb": 136,
       "disk_write_mb": 1
     }
@@ -76,10 +76,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.4,
-      "cpu_pct": 244.6,
-      "peak_memory_mb": 3482,
-      "disk_read_mb": 134,
+      "duration_sec": 7.5,
+      "cpu_pct": 231.1,
+      "peak_memory_mb": 3584,
+      "disk_read_mb": 133,
       "disk_write_mb": 1
     }
   },
@@ -90,10 +90,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.2,
-      "cpu_pct": 324.6,
-      "peak_memory_mb": 4813,
-      "disk_read_mb": 139,
+      "duration_sec": 4.9,
+      "cpu_pct": 442.1,
+      "peak_memory_mb": 6144,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -104,10 +104,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 7.5,
-      "cpu_pct": 266.3,
-      "peak_memory_mb": 4813,
-      "disk_read_mb": 139,
+      "duration_sec": 5.9,
+      "cpu_pct": 386.2,
+      "peak_memory_mb": 6144,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -118,10 +118,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.2,
-      "cpu_pct": 324.5,
-      "peak_memory_mb": 4813,
-      "disk_read_mb": 139,
+      "duration_sec": 10.8,
+      "cpu_pct": 207.3,
+      "peak_memory_mb": 6144,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -132,10 +132,10 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 7.3,
-      "cpu_pct": 316.4,
+      "duration_sec": 5.1,
+      "cpu_pct": 363.1,
       "peak_memory_mb": 6144,
-      "disk_read_mb": 136,
+      "disk_read_mb": 135,
       "disk_write_mb": 1
     }
   },
@@ -146,8 +146,8 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 7.7,
-      "cpu_pct": 356.2,
+      "duration_sec": 5.8,
+      "cpu_pct": 393.2,
       "peak_memory_mb": 6144,
       "disk_read_mb": 133,
       "disk_write_mb": 1
@@ -160,9 +160,9 @@
     "metric_id": "mean_rowwise_correlation",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 7.5,
-      "cpu_pct": 228.6,
-      "peak_memory_mb": 3482,
+      "duration_sec": 11.1,
+      "cpu_pct": 192.7,
+      "peak_memory_mb": 6144,
       "disk_read_mb": 133,
       "disk_write_mb": 1
     }
@@ -174,10 +174,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.7,
-      "cpu_pct": 297.7,
-      "peak_memory_mb": 6144,
-      "disk_read_mb": 139,
+      "duration_sec": 6.1,
+      "cpu_pct": 262.4,
+      "peak_memory_mb": 3380,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -188,10 +188,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 5.5,
-      "cpu_pct": 270.2,
-      "peak_memory_mb": 3380,
-      "disk_read_mb": 136,
+      "duration_sec": 4.2,
+      "cpu_pct": 511.1,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 135,
       "disk_write_mb": 1
     }
   },
@@ -202,9 +202,9 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 5.5,
-      "cpu_pct": 267.1,
-      "peak_memory_mb": 3380,
+      "duration_sec": 6.6,
+      "cpu_pct": 247.5,
+      "peak_memory_mb": 6144,
       "disk_read_mb": 138,
       "disk_write_mb": 1
     }
@@ -216,8 +216,8 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.8,
-      "cpu_pct": 282.2,
+      "duration_sec": 7.6,
+      "cpu_pct": 283.3,
       "peak_memory_mb": 6144,
       "disk_read_mb": 134,
       "disk_write_mb": 1
@@ -230,8 +230,8 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.9,
-      "cpu_pct": 337.4,
+      "duration_sec": 8.7,
+      "cpu_pct": 249.4,
       "peak_memory_mb": 6144,
       "disk_read_mb": 136,
       "disk_write_mb": 1
@@ -244,10 +244,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.8,
-      "cpu_pct": 286.6,
-      "peak_memory_mb": 6144,
-      "disk_read_mb": 134,
+      "duration_sec": 6.2,
+      "cpu_pct": 259.8,
+      "peak_memory_mb": 3380,
+      "disk_read_mb": 133,
       "disk_write_mb": 1
     }
   },
@@ -258,10 +258,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 5.3,
-      "cpu_pct": 363.4,
-      "peak_memory_mb": 4813,
-      "disk_read_mb": 139,
+      "duration_sec": 4.4,
+      "cpu_pct": 455,
+      "peak_memory_mb": 6144,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -272,10 +272,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.6,
-      "cpu_pct": 325.9,
-      "peak_memory_mb": 6144,
-      "disk_read_mb": 139,
+      "duration_sec": 4,
+      "cpu_pct": 525,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -286,10 +286,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.8,
-      "cpu_pct": 383.2,
-      "peak_memory_mb": 6144,
-      "disk_read_mb": 139,
+      "duration_sec": 6.1,
+      "cpu_pct": 261.6,
+      "peak_memory_mb": 3380,
+      "disk_read_mb": 138,
       "disk_write_mb": 1
     }
   },
@@ -300,10 +300,10 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 5.7,
-      "cpu_pct": 259.2,
-      "peak_memory_mb": 3380,
-      "disk_read_mb": 136,
+      "duration_sec": 4.1,
+      "cpu_pct": 432.8,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 135,
       "disk_write_mb": 1
     }
   },
@@ -314,8 +314,8 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 11,
-      "cpu_pct": 241.6,
+      "duration_sec": 7.9,
+      "cpu_pct": 245.5,
       "peak_memory_mb": 6144,
       "disk_read_mb": 133,
       "disk_write_mb": 1
@@ -328,9 +328,9 @@
     "metric_id": "mean_rowwise_error",
     "resources": {
       "exit_code": 0,
-      "duration_sec": 12.3,
-      "cpu_pct": 206.4,
-      "peak_memory_mb": 6042,
+      "duration_sec": 6.4,
+      "cpu_pct": 249.6,
+      "peak_memory_mb": 3380,
       "disk_read_mb": 133,
       "disk_write_mb": 1
     }
diff --git a/results/perturbation_prediction/data/metric_info.json b/results/perturbation_prediction/data/metric_info.json
index f4377d15..a50c2a49 100644
--- a/results/perturbation_prediction/data/metric_info.json
+++ b/results/perturbation_prediction/data/metric_info.json
@@ -5,10 +5,12 @@
     "metric_name": "Mean Rowwise RMSE",
     "metric_summary": "The mean of the root mean squared error (RMSE) of each row in the matrix.",
     "metric_description": "We use the **Mean Rowwise Root Mean Squared Error** to score submissions, computed as follows:\n\n$$\n\\textrm{MRRMSE} = \\frac{1}{R}\\sum_{i=1}^R\\left(\\frac{1}{n} \\sum_{j=1}^{n} (y_{ij} - \\widehat{y}_{ij})^2\\right)^{1/2}\n$$\n\nwhere $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ bis the number of columns.\n",
-    "paper_reference": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/metrics/mean_rowwise_error/config.vsh.yaml",
+    "paper_reference": {
+      "bibtex": "@inproceedings{slazata2024benchmark,\n  title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n  author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n  booktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year = {2024},\n  url = {https://openreview.net/forum?id=WTI4RJYSVm}\n}\n"
+    },
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/metrics/mean_rowwise_error/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad",
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e",
     "maximize": false
   },
   {
@@ -17,10 +19,12 @@
     "metric_name": "Mean Rowwise MAE",
     "metric_summary": "The mean of the absolute error (MAE) of each row in the matrix.",
     "metric_description": "We use the **Mean Rowwise Absolute Error** to score submissions, computed as follows:\n\n$$\n\\textrm{MRMAE} = \\frac{1}{R}\\sum_{i=1}^R\\left(\\frac{1}{n} \\sum_{j=1}^{n} |y_{ij} - \\widehat{y}_{ij}|\\right)\n$$\n\nwhere $(R)$ is the number of scored rows, and $(y_{ij})$ and $(\\widehat{y}_{ij})$ are the actual and predicted values, respectively, for row $(i)$ and column $(j)$, and $(n)$ bis the number of columns.\n",
-    "paper_reference": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/metrics/mean_rowwise_error/config.vsh.yaml",
+    "paper_reference": {
+      "bibtex": "@article{slazata2024benchmark,\n  title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n  author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n  booktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year = {2024},\n  url = {https://openreview.net/forum?id=WTI4RJYSVm}\n}\n"
+    },
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/metrics/mean_rowwise_error/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad",
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e",
     "maximize": false
   },
   {
@@ -29,10 +33,12 @@
     "metric_name": "Mean Rowwise Pearson",
     "metric_summary": "The mean of Pearson correlations per row (perturbation).",
     "metric_description": "The **Mean Pearson Correlation** is computed as follows:\n\n$$\n\\textrm{Mean-Pearson} = \\frac{1}{R}\\sum_{i=1}^R\\frac{\\textrm{Cov}(\\mathbf{y}_i, \\mathbf{\\hat{y}}_i)}{\\textrm{Var}(\\mathbf{y}_i) \\cdot \\textrm{Var}(\\mathbf{\\hat{y}}_i)}\n$$\n\nwhere $(R)$ is the number of scored rows, and $(\\mathbf{y}_i)$ and $(\\mathbf{\\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.\n",
-    "paper_reference": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/metrics/mean_rowwise_correlation/config.vsh.yaml",
+    "paper_reference": {
+      "bibtex": "@article{slazata2024benchmark,\n  title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n  author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n  booktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year = {2024},\n  url = {https://openreview.net/forum?id=WTI4RJYSVm}\n}\n"
+    },
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/metrics/mean_rowwise_correlation/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad",
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e",
     "maximize": true
   },
   {
@@ -41,10 +47,12 @@
     "metric_name": "Mean Rowwise Spearman",
     "metric_summary": "The mean of Spearman correlations per row (perturbation).",
     "metric_description": "The **Mean Spearman Correlation** is computed as follows:\n\n$$\n\\textrm{Mean-Pearson} = \\frac{1}{R}\\sum_{i=1}^R\\frac{\\textrm{Cov}(\\mathbf{r}_i, \\mathbf{\\hat{r}}_i)}{\\textrm{Var}(\\mathbf{r}_i) \\cdot \\textrm{Var}(\\mathbf{\\hat{r}}_i)}\n$$\n\nwhere $(R)$ is the number of scored rows, and $(\\mathbf{r}_i)$ and $(\\mathbf{\\hat{r}}_i)$ are the ranks of the actual and predicted values, respectively, for row $(i)$.\n",
-    "paper_reference": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/metrics/mean_rowwise_correlation/config.vsh.yaml",
+    "paper_reference": {
+      "bibtex": "@article{slazata2024benchmark,\n  title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n  author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n  booktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year = {2024},\n  url = {https://openreview.net/forum?id=WTI4RJYSVm}\n}\n"
+    },
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/metrics/mean_rowwise_correlation/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad",
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e",
     "maximize": true
   },
   {
@@ -53,10 +61,12 @@
     "metric_name": "Mean Rowwise Cosine",
     "metric_summary": "The mean of cosine similarities per row (perturbation).",
     "metric_description": "The **Mean Cosine Similarity** is computed as follows:\n\n$$\n\\textrm{Mean-Cosine} = \\frac{1}{R}\\sum_{i=1}^R\\frac{\\mathbf{y}_i\\cdot \\mathbf{\\hat{y}}_i}{\\|\\mathbf{y}_i\\| \\|\\mathbf{\\hat{y}}_i\\|}\n$$\n\nwhere $(R)$ is the number of scored rows, and $(\\mathbf{y}_i)$ and $(\\mathbf{\\hat{y}}_i)$ are the actual and predicted values, respectively, for row $(i)$.\n",
-    "paper_reference": null,
-    "implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/metrics/mean_rowwise_correlation/config.vsh.yaml",
+    "paper_reference": {
+      "bibtex": "@article{slazata2024benchmark,\n  title = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n  author = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n  booktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n  year = {2024},\n  url = {https://openreview.net/forum?id=WTI4RJYSVm}\n}\n"
+    },
+    "implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/metrics/mean_rowwise_correlation/config.vsh.yaml",
     "code_version": null,
-    "commit_sha": "a161cfd989c11df9949386a103110fac45734cad",
+    "commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e",
     "maximize": true
   }
 ]
diff --git a/results/perturbation_prediction/data/quality_control.json b/results/perturbation_prediction/data/quality_control.json
index e50297b4..a9095174 100644
--- a/results/perturbation_prediction/data/quality_control.json
+++ b/results/perturbation_prediction/data/quality_control.json
@@ -1,1632 +1,1632 @@
 [
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Task info", 
         "name": "Pct 'task_id' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing([task_info], field)", 
-        "message": "Task metadata field 'task_id' should be defined\n  Task id: perturbation_prediction\n  Field: task_id\n"
+        "message": "Task metadata field 'task_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Task info", 
         "name": "Pct 'task_name' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing([task_info], field)", 
-        "message": "Task metadata field 'task_name' should be defined\n  Task id: perturbation_prediction\n  Field: task_name\n"
+        "message": "Task metadata field 'task_name' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_name\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Task info", 
         "name": "Pct 'task_summary' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing([task_info], field)", 
-        "message": "Task metadata field 'task_summary' should be defined\n  Task id: perturbation_prediction\n  Field: task_summary\n"
+        "message": "Task metadata field 'task_summary' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_summary\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Task info", 
         "name": "Pct 'task_description' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing([task_info], field)", 
-        "message": "Task metadata field 'task_description' should be defined\n  Task id: perturbation_prediction\n  Field: task_description\n"
+        "message": "Task metadata field 'task_description' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_description\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'task_id' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'task_id' should be defined\n  Task id: perturbation_prediction\n  Field: task_id\n"
+        "message": "Method metadata field 'task_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'commit_sha' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'commit_sha' should be defined\n  Task id: perturbation_prediction\n  Field: commit_sha\n"
+        "message": "Method metadata field 'commit_sha' should be defined\n  Task id: task_perturbation_prediction\n  Field: commit_sha\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'method_id' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'method_id' should be defined\n  Task id: perturbation_prediction\n  Field: method_id\n"
+        "message": "Method metadata field 'method_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: method_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'method_name' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'method_name' should be defined\n  Task id: perturbation_prediction\n  Field: method_name\n"
+        "message": "Method metadata field 'method_name' should be defined\n  Task id: task_perturbation_prediction\n  Field: method_name\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'method_summary' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'method_summary' should be defined\n  Task id: perturbation_prediction\n  Field: method_summary\n"
+        "message": "Method metadata field 'method_summary' should be defined\n  Task id: task_perturbation_prediction\n  Field: method_summary\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'paper_reference' missing", 
-        "value": 0.4166666666666667, 
+        "value": 0.5, 
         "severity": 2, 
         "severity_value": 3.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'paper_reference' should be defined\n  Task id: perturbation_prediction\n  Field: paper_reference\n"
+        "message": "Method metadata field 'paper_reference' should be defined\n  Task id: task_perturbation_prediction\n  Field: paper_reference\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Method info", 
         "name": "Pct 'is_baseline' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(method_info, field)", 
-        "message": "Method metadata field 'is_baseline' should be defined\n  Task id: perturbation_prediction\n  Field: is_baseline\n"
+        "message": "Method metadata field 'is_baseline' should be defined\n  Task id: task_perturbation_prediction\n  Field: is_baseline\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'task_id' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'task_id' should be defined\n  Task id: perturbation_prediction\n  Field: task_id\n"
+        "message": "Metric metadata field 'task_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'commit_sha' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'commit_sha' should be defined\n  Task id: perturbation_prediction\n  Field: commit_sha\n"
+        "message": "Metric metadata field 'commit_sha' should be defined\n  Task id: task_perturbation_prediction\n  Field: commit_sha\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'metric_id' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'metric_id' should be defined\n  Task id: perturbation_prediction\n  Field: metric_id\n"
+        "message": "Metric metadata field 'metric_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: metric_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'metric_name' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'metric_name' should be defined\n  Task id: perturbation_prediction\n  Field: metric_name\n"
+        "message": "Metric metadata field 'metric_name' should be defined\n  Task id: task_perturbation_prediction\n  Field: metric_name\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'metric_summary' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'metric_summary' should be defined\n  Task id: perturbation_prediction\n  Field: metric_summary\n"
+        "message": "Metric metadata field 'metric_summary' should be defined\n  Task id: task_perturbation_prediction\n  Field: metric_summary\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'paper_reference' missing", 
-        "value": 1.0, 
-        "severity": 2, 
-        "severity_value": 3.0, 
+        "value": 0.0, 
+        "severity": 0, 
+        "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'paper_reference' should be defined\n  Task id: perturbation_prediction\n  Field: paper_reference\n"
+        "message": "Metric metadata field 'paper_reference' should be defined\n  Task id: task_perturbation_prediction\n  Field: paper_reference\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Metric info", 
         "name": "Pct 'maximize' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(metric_info, field)", 
-        "message": "Metric metadata field 'maximize' should be defined\n  Task id: perturbation_prediction\n  Field: maximize\n"
+        "message": "Metric metadata field 'maximize' should be defined\n  Task id: task_perturbation_prediction\n  Field: maximize\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Dataset info", 
         "name": "Pct 'task_id' missing", 
-        "value": 0.0, 
-        "severity": 0, 
-        "severity_value": 0.0, 
+        "value": 1.0, 
+        "severity": 2, 
+        "severity_value": 3.0, 
         "code": "percent_missing(dataset_info, field)", 
-        "message": "Dataset metadata field 'task_id' should be defined\n  Task id: perturbation_prediction\n  Field: task_id\n"
+        "message": "Dataset metadata field 'task_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: task_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Dataset info", 
         "name": "Pct 'dataset_id' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(dataset_info, field)", 
-        "message": "Dataset metadata field 'dataset_id' should be defined\n  Task id: perturbation_prediction\n  Field: dataset_id\n"
+        "message": "Dataset metadata field 'dataset_id' should be defined\n  Task id: task_perturbation_prediction\n  Field: dataset_id\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Dataset info", 
         "name": "Pct 'dataset_name' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(dataset_info, field)", 
-        "message": "Dataset metadata field 'dataset_name' should be defined\n  Task id: perturbation_prediction\n  Field: dataset_name\n"
+        "message": "Dataset metadata field 'dataset_name' should be defined\n  Task id: task_perturbation_prediction\n  Field: dataset_name\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Dataset info", 
         "name": "Pct 'dataset_summary' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(dataset_info, field)", 
-        "message": "Dataset metadata field 'dataset_summary' should be defined\n  Task id: perturbation_prediction\n  Field: dataset_summary\n"
+        "message": "Dataset metadata field 'dataset_summary' should be defined\n  Task id: task_perturbation_prediction\n  Field: dataset_summary\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Dataset info", 
         "name": "Pct 'data_reference' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(dataset_info, field)", 
-        "message": "Dataset metadata field 'data_reference' should be defined\n  Task id: perturbation_prediction\n  Field: data_reference\n"
+        "message": "Dataset metadata field 'data_reference' should be defined\n  Task id: task_perturbation_prediction\n  Field: data_reference\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Dataset info", 
         "name": "Pct 'data_url' missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "percent_missing(dataset_info, field)", 
-        "message": "Dataset metadata field 'data_url' should be defined\n  Task id: perturbation_prediction\n  Field: data_url\n"
+        "message": "Dataset metadata field 'data_url' should be defined\n  Task id: task_perturbation_prediction\n  Field: data_url\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw data", 
         "name": "Number of results", 
         "value": 12, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "len(results) == len(method_info) * len(metric_info) * len(dataset_info)", 
-        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: perturbation_prediction\n  Number of results: 12\n  Number of methods: 12\n  Number of metrics: 5\n  Number of datasets: 1\n"
+        "message": "Number of results should be equal to #methods × #metrics × #datasets.\n  Task id: task_perturbation_prediction\n  Number of results: 12\n  Number of methods: 12\n  Number of metrics: 5\n  Number of datasets: 1\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Metric 'mean_rowwise_rmse' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  Metric id: mean_rowwise_rmse\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  Metric id: mean_rowwise_rmse\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Metric 'mean_rowwise_mae' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  Metric id: mean_rowwise_mae\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  Metric id: mean_rowwise_mae\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Metric 'mean_rowwise_pearson' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  Metric id: mean_rowwise_pearson\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  Metric id: mean_rowwise_pearson\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Metric 'mean_rowwise_spearman' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  Metric id: mean_rowwise_spearman\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  Metric id: mean_rowwise_spearman\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Metric 'mean_rowwise_cosine' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  Metric id: mean_rowwise_cosine\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  Metric id: mean_rowwise_cosine\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'ground_truth' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: ground_truth\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: ground_truth\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'mean_outcome' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: mean_outcome\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: mean_outcome\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'mean_across_celltypes' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: mean_across_celltypes\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: mean_across_celltypes\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'mean_across_compounds' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: mean_across_compounds\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: mean_across_compounds\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'sample' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: sample\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: sample\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'zeros' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: zeros\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: zeros\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'lgc_ensemble' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: lgc_ensemble\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: lgc_ensemble\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'nn_retraining_with_pseudolabels' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: nn_retraining_with_pseudolabels\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: nn_retraining_with_pseudolabels\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'jn_ap_op2' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: jn_ap_op2\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: jn_ap_op2\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'scape' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: scape\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: scape\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'transformer_ensemble' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: transformer_ensemble\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: transformer_ensemble\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Method 'pyboost' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  method id: pyboost\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  method id: pyboost\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Raw results", 
         "name": "Dataset 'neurips-2023-data' %missing", 
         "value": 0.0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "pct_missing <= .1", 
-        "message": "Percentage of missing results should be less than 10%.\n  Task id: perturbation_prediction\n  dataset id: neurips-2023-data\n  Percentage missing: 0%\n"
+        "message": "Percentage of missing results should be less than 10%.\n  Task id: task_perturbation_prediction\n  dataset id: neurips-2023-data\n  Percentage missing: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score ground_truth mean_rowwise_rmse", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method ground_truth performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_rmse\n  Worst score: 1%\n"
+        "message": "Method ground_truth performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_rmse\n  Worst score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score ground_truth mean_rowwise_rmse", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_rmse\n  Best score: 1%\n"
+        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_rmse\n  Best score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_outcome mean_rowwise_rmse", 
-        "value": 0.3403, 
+        "value": 0.3405, 
         "severity": 0, 
-        "severity_value": -0.3403, 
+        "severity_value": -0.3405, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3403%\n"
+        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3405%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_outcome mean_rowwise_rmse", 
-        "value": 0.3403, 
+        "value": 0.3405, 
         "severity": 0, 
-        "severity_value": 0.17015, 
+        "severity_value": 0.17025, 
         "code": "best_score <= 2", 
-        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3403%\n"
+        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3405%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_celltypes mean_rowwise_rmse", 
-        "value": 0.3453, 
+        "value": 0.3455, 
         "severity": 0, 
-        "severity_value": -0.3453, 
+        "severity_value": -0.3455, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3453%\n"
+        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3455%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_celltypes mean_rowwise_rmse", 
-        "value": 0.3453, 
+        "value": 0.3455, 
         "severity": 0, 
-        "severity_value": 0.17265, 
+        "severity_value": 0.17275, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3453%\n"
+        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3455%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_compounds mean_rowwise_rmse", 
-        "value": 0.3083, 
+        "value": 0.3086, 
         "severity": 0, 
-        "severity_value": -0.3083, 
+        "severity_value": -0.3086, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3083%\n"
+        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3086%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_compounds mean_rowwise_rmse", 
-        "value": 0.3083, 
+        "value": 0.3086, 
         "severity": 0, 
-        "severity_value": 0.15415, 
+        "severity_value": 0.1543, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3083%\n"
+        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3086%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score sample mean_rowwise_rmse", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method sample performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_rmse\n  Worst score: 0%\n"
+        "message": "Method sample performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_rmse\n  Worst score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score sample mean_rowwise_rmse", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method sample performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_rmse\n  Best score: 0%\n"
+        "message": "Method sample performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_rmse\n  Best score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score zeros mean_rowwise_rmse", 
-        "value": 0.3266, 
+        "value": 0.3268, 
         "severity": 0, 
-        "severity_value": -0.3266, 
+        "severity_value": -0.3268, 
         "code": "worst_score >= -1", 
-        "message": "Method zeros performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3266%\n"
+        "message": "Method zeros performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3268%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score zeros mean_rowwise_rmse", 
-        "value": 0.3266, 
+        "value": 0.3268, 
         "severity": 0, 
-        "severity_value": 0.1633, 
+        "severity_value": 0.1634, 
         "code": "best_score <= 2", 
-        "message": "Method zeros performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3266%\n"
+        "message": "Method zeros performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3268%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score lgc_ensemble mean_rowwise_rmse", 
-        "value": 0.4119, 
+        "value": 0.4191, 
         "severity": 0, 
-        "severity_value": -0.4119, 
+        "severity_value": -0.4191, 
         "code": "worst_score >= -1", 
-        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4119%\n"
+        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4191%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score lgc_ensemble mean_rowwise_rmse", 
-        "value": 0.4119, 
+        "value": 0.4191, 
         "severity": 0, 
-        "severity_value": 0.20595, 
+        "severity_value": 0.20955, 
         "code": "best_score <= 2", 
-        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4119%\n"
+        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4191%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score nn_retraining_with_pseudolabels mean_rowwise_rmse", 
-        "value": 0.4448, 
+        "value": 0.4455, 
         "severity": 0, 
-        "severity_value": -0.4448, 
+        "severity_value": -0.4455, 
         "code": "worst_score >= -1", 
-        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4448%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4455%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score nn_retraining_with_pseudolabels mean_rowwise_rmse", 
-        "value": 0.4448, 
+        "value": 0.4455, 
         "severity": 0, 
-        "severity_value": 0.2224, 
+        "severity_value": 0.22275, 
         "code": "best_score <= 2", 
-        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4448%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4455%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score jn_ap_op2 mean_rowwise_rmse", 
-        "value": 0.3442, 
+        "value": 0.3425, 
         "severity": 0, 
-        "severity_value": -0.3442, 
+        "severity_value": -0.3425, 
         "code": "worst_score >= -1", 
-        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3442%\n"
+        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3425%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score jn_ap_op2 mean_rowwise_rmse", 
-        "value": 0.3442, 
+        "value": 0.3425, 
         "severity": 0, 
-        "severity_value": 0.1721, 
+        "severity_value": 0.17125, 
         "code": "best_score <= 2", 
-        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3442%\n"
+        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3425%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score scape mean_rowwise_rmse", 
-        "value": 0.4311, 
+        "value": 0.4318, 
         "severity": 0, 
-        "severity_value": -0.4311, 
+        "severity_value": -0.4318, 
         "code": "worst_score >= -1", 
-        "message": "Method scape performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4311%\n"
+        "message": "Method scape performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4318%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score scape mean_rowwise_rmse", 
-        "value": 0.4311, 
+        "value": 0.4318, 
         "severity": 0, 
-        "severity_value": 0.21555, 
+        "severity_value": 0.2159, 
         "code": "best_score <= 2", 
-        "message": "Method scape performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4311%\n"
+        "message": "Method scape performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4318%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score transformer_ensemble mean_rowwise_rmse", 
-        "value": 0.3417, 
+        "value": 0.3408, 
         "severity": 0, 
-        "severity_value": -0.3417, 
+        "severity_value": -0.3408, 
         "code": "worst_score >= -1", 
-        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3417%\n"
+        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.3408%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score transformer_ensemble mean_rowwise_rmse", 
-        "value": 0.3417, 
+        "value": 0.3408, 
         "severity": 0, 
-        "severity_value": 0.17085, 
+        "severity_value": 0.1704, 
         "code": "best_score <= 2", 
-        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3417%\n"
+        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_rmse\n  Best score: 0.3408%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score pyboost mean_rowwise_rmse", 
-        "value": 0.4168, 
+        "value": 0.4165, 
         "severity": 0, 
-        "severity_value": -0.4168, 
+        "severity_value": -0.4165, 
         "code": "worst_score >= -1", 
-        "message": "Method pyboost performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4168%\n"
+        "message": "Method pyboost performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_rmse\n  Worst score: 0.4165%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score pyboost mean_rowwise_rmse", 
-        "value": 0.4168, 
+        "value": 0.4165, 
         "severity": 0, 
-        "severity_value": 0.2084, 
+        "severity_value": 0.20825, 
         "code": "best_score <= 2", 
-        "message": "Method pyboost performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4168%\n"
+        "message": "Method pyboost performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_rmse\n  Best score: 0.4165%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score ground_truth mean_rowwise_mae", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method ground_truth performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_mae\n  Worst score: 1%\n"
+        "message": "Method ground_truth performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_mae\n  Worst score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score ground_truth mean_rowwise_mae", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_mae\n  Best score: 1%\n"
+        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_mae\n  Best score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_outcome mean_rowwise_mae", 
-        "value": 0.34, 
+        "value": 0.3401, 
         "severity": 0, 
-        "severity_value": -0.34, 
+        "severity_value": -0.3401, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_mae\n  Worst score: 0.34%\n"
+        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3401%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_outcome mean_rowwise_mae", 
-        "value": 0.34, 
+        "value": 0.3401, 
         "severity": 0, 
-        "severity_value": 0.17, 
+        "severity_value": 0.17005, 
         "code": "best_score <= 2", 
-        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_mae\n  Best score: 0.34%\n"
+        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_mae\n  Best score: 0.3401%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_celltypes mean_rowwise_mae", 
-        "value": 0.3316, 
+        "value": 0.3318, 
         "severity": 0, 
-        "severity_value": -0.3316, 
+        "severity_value": -0.3318, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3316%\n"
+        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3318%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_celltypes mean_rowwise_mae", 
-        "value": 0.3316, 
+        "value": 0.3318, 
         "severity": 0, 
-        "severity_value": 0.1658, 
+        "severity_value": 0.1659, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_mae\n  Best score: 0.3316%\n"
+        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_mae\n  Best score: 0.3318%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_compounds mean_rowwise_mae", 
-        "value": 0.2754, 
+        "value": 0.2755, 
         "severity": 0, 
-        "severity_value": -0.2754, 
+        "severity_value": -0.2755, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_mae\n  Worst score: 0.2754%\n"
+        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_mae\n  Worst score: 0.2755%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_compounds mean_rowwise_mae", 
-        "value": 0.2754, 
+        "value": 0.2755, 
         "severity": 0, 
-        "severity_value": 0.1377, 
+        "severity_value": 0.13775, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_mae\n  Best score: 0.2754%\n"
+        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_mae\n  Best score: 0.2755%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score sample mean_rowwise_mae", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method sample performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_mae\n  Worst score: 0%\n"
+        "message": "Method sample performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_mae\n  Worst score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score sample mean_rowwise_mae", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method sample performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_mae\n  Best score: 0%\n"
+        "message": "Method sample performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_mae\n  Best score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score zeros mean_rowwise_mae", 
-        "value": 0.3406, 
+        "value": 0.3407, 
         "severity": 0, 
-        "severity_value": -0.3406, 
+        "severity_value": -0.3407, 
         "code": "worst_score >= -1", 
-        "message": "Method zeros performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3406%\n"
+        "message": "Method zeros performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3407%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score zeros mean_rowwise_mae", 
-        "value": 0.3406, 
+        "value": 0.3407, 
         "severity": 0, 
-        "severity_value": 0.1703, 
+        "severity_value": 0.17035, 
         "code": "best_score <= 2", 
-        "message": "Method zeros performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_mae\n  Best score: 0.3406%\n"
+        "message": "Method zeros performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_mae\n  Best score: 0.3407%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score lgc_ensemble mean_rowwise_mae", 
-        "value": 0.3936, 
+        "value": 0.4024, 
         "severity": 0, 
-        "severity_value": -0.3936, 
+        "severity_value": -0.4024, 
         "code": "worst_score >= -1", 
-        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3936%\n"
+        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_mae\n  Worst score: 0.4024%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score lgc_ensemble mean_rowwise_mae", 
-        "value": 0.3936, 
+        "value": 0.4024, 
         "severity": 0, 
-        "severity_value": 0.1968, 
+        "severity_value": 0.2012, 
         "code": "best_score <= 2", 
-        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_mae\n  Best score: 0.3936%\n"
+        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_mae\n  Best score: 0.4024%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score nn_retraining_with_pseudolabels mean_rowwise_mae", 
-        "value": 0.4321, 
+        "value": 0.4328, 
         "severity": 0, 
-        "severity_value": -0.4321, 
+        "severity_value": -0.4328, 
         "code": "worst_score >= -1", 
-        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_mae\n  Worst score: 0.4321%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_mae\n  Worst score: 0.4328%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score nn_retraining_with_pseudolabels mean_rowwise_mae", 
-        "value": 0.4321, 
+        "value": 0.4328, 
         "severity": 0, 
-        "severity_value": 0.21605, 
+        "severity_value": 0.2164, 
         "code": "best_score <= 2", 
-        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_mae\n  Best score: 0.4321%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_mae\n  Best score: 0.4328%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score jn_ap_op2 mean_rowwise_mae", 
-        "value": 0.3258, 
+        "value": 0.3233, 
         "severity": 0, 
-        "severity_value": -0.3258, 
+        "severity_value": -0.3233, 
         "code": "worst_score >= -1", 
-        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3258%\n"
+        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3233%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score jn_ap_op2 mean_rowwise_mae", 
-        "value": 0.3258, 
+        "value": 0.3233, 
         "severity": 0, 
-        "severity_value": 0.1629, 
+        "severity_value": 0.16165, 
         "code": "best_score <= 2", 
-        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_mae\n  Best score: 0.3258%\n"
+        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_mae\n  Best score: 0.3233%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score scape mean_rowwise_mae", 
-        "value": 0.4055, 
+        "value": 0.406, 
         "severity": 0, 
-        "severity_value": -0.4055, 
+        "severity_value": -0.406, 
         "code": "worst_score >= -1", 
-        "message": "Method scape performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_mae\n  Worst score: 0.4055%\n"
+        "message": "Method scape performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_mae\n  Worst score: 0.406%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score scape mean_rowwise_mae", 
-        "value": 0.4055, 
+        "value": 0.406, 
         "severity": 0, 
-        "severity_value": 0.20275, 
+        "severity_value": 0.203, 
         "code": "best_score <= 2", 
-        "message": "Method scape performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_mae\n  Best score: 0.4055%\n"
+        "message": "Method scape performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_mae\n  Best score: 0.406%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score transformer_ensemble mean_rowwise_mae", 
-        "value": 0.3483, 
+        "value": 0.342, 
         "severity": 0, 
-        "severity_value": -0.3483, 
+        "severity_value": -0.342, 
         "code": "worst_score >= -1", 
-        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_mae\n  Worst score: 0.3483%\n"
+        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_mae\n  Worst score: 0.342%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score transformer_ensemble mean_rowwise_mae", 
-        "value": 0.3483, 
+        "value": 0.342, 
         "severity": 0, 
-        "severity_value": 0.17415, 
+        "severity_value": 0.171, 
         "code": "best_score <= 2", 
-        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_mae\n  Best score: 0.3483%\n"
+        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_mae\n  Best score: 0.342%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score pyboost mean_rowwise_mae", 
-        "value": 0.4182, 
+        "value": 0.4178, 
         "severity": 0, 
-        "severity_value": -0.4182, 
+        "severity_value": -0.4178, 
         "code": "worst_score >= -1", 
-        "message": "Method pyboost performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_mae\n  Worst score: 0.4182%\n"
+        "message": "Method pyboost performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_mae\n  Worst score: 0.4178%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score pyboost mean_rowwise_mae", 
-        "value": 0.4182, 
+        "value": 0.4178, 
         "severity": 0, 
-        "severity_value": 0.2091, 
+        "severity_value": 0.2089, 
         "code": "best_score <= 2", 
-        "message": "Method pyboost performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_mae\n  Best score: 0.4182%\n"
+        "message": "Method pyboost performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_mae\n  Best score: 0.4178%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score ground_truth mean_rowwise_pearson", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method ground_truth performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_pearson\n  Worst score: 1%\n"
+        "message": "Method ground_truth performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_pearson\n  Worst score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score ground_truth mean_rowwise_pearson", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_pearson\n  Best score: 1%\n"
+        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_pearson\n  Best score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_outcome mean_rowwise_pearson", 
         "value": 0.2198, 
         "severity": 0, 
         "severity_value": -0.2198, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2198%\n"
+        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2198%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_outcome mean_rowwise_pearson", 
         "value": 0.2198, 
         "severity": 0, 
         "severity_value": 0.1099, 
         "code": "best_score <= 2", 
-        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2198%\n"
+        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2198%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_celltypes mean_rowwise_pearson", 
         "value": 0.2972, 
         "severity": 0, 
         "severity_value": -0.2972, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2972%\n"
+        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2972%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_celltypes mean_rowwise_pearson", 
         "value": 0.2972, 
         "severity": 0, 
         "severity_value": 0.1486, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2972%\n"
+        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2972%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_compounds mean_rowwise_pearson", 
         "value": 0.2594, 
         "severity": 0, 
         "severity_value": -0.2594, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2594%\n"
+        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2594%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_compounds mean_rowwise_pearson", 
         "value": 0.2594, 
         "severity": 0, 
         "severity_value": 0.1297, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2594%\n"
+        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2594%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score sample mean_rowwise_pearson", 
-        "value": 0.0514, 
+        "value": 0.0524, 
         "severity": 0, 
-        "severity_value": -0.0514, 
+        "severity_value": -0.0524, 
         "code": "worst_score >= -1", 
-        "message": "Method sample performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.0514%\n"
+        "message": "Method sample performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.0524%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score sample mean_rowwise_pearson", 
-        "value": 0.0514, 
+        "value": 0.0524, 
         "severity": 0, 
-        "severity_value": 0.0257, 
+        "severity_value": 0.0262, 
         "code": "best_score <= 2", 
-        "message": "Method sample performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_pearson\n  Best score: 0.0514%\n"
+        "message": "Method sample performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_pearson\n  Best score: 0.0524%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score zeros mean_rowwise_pearson", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method zeros performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_pearson\n  Worst score: 0%\n"
+        "message": "Method zeros performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_pearson\n  Worst score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score zeros mean_rowwise_pearson", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method zeros performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_pearson\n  Best score: 0%\n"
+        "message": "Method zeros performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_pearson\n  Best score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score lgc_ensemble mean_rowwise_pearson", 
-        "value": 0.4503, 
+        "value": 0.4514, 
         "severity": 0, 
-        "severity_value": -0.4503, 
+        "severity_value": -0.4514, 
         "code": "worst_score >= -1", 
-        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.4503%\n"
+        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.4514%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score lgc_ensemble mean_rowwise_pearson", 
-        "value": 0.4503, 
+        "value": 0.4514, 
         "severity": 0, 
-        "severity_value": 0.22515, 
+        "severity_value": 0.2257, 
         "code": "best_score <= 2", 
-        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_pearson\n  Best score: 0.4503%\n"
+        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_pearson\n  Best score: 0.4514%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score nn_retraining_with_pseudolabels mean_rowwise_pearson", 
-        "value": 0.49, 
+        "value": 0.4909, 
         "severity": 0, 
-        "severity_value": -0.49, 
+        "severity_value": -0.4909, 
         "code": "worst_score >= -1", 
-        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.49%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.4909%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score nn_retraining_with_pseudolabels mean_rowwise_pearson", 
-        "value": 0.49, 
+        "value": 0.4909, 
         "severity": 0, 
-        "severity_value": 0.245, 
+        "severity_value": 0.24545, 
         "code": "best_score <= 2", 
-        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_pearson\n  Best score: 0.49%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_pearson\n  Best score: 0.4909%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score jn_ap_op2 mean_rowwise_pearson", 
         "value": 0.3267, 
         "severity": 0, 
         "severity_value": -0.3267, 
         "code": "worst_score >= -1", 
-        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.3267%\n"
+        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.3267%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score jn_ap_op2 mean_rowwise_pearson", 
         "value": 0.3267, 
         "severity": 0, 
         "severity_value": 0.16335, 
         "code": "best_score <= 2", 
-        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_pearson\n  Best score: 0.3267%\n"
+        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_pearson\n  Best score: 0.3267%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score scape mean_rowwise_pearson", 
-        "value": 0.4724, 
+        "value": 0.4728, 
         "severity": 0, 
-        "severity_value": -0.4724, 
+        "severity_value": -0.4728, 
         "code": "worst_score >= -1", 
-        "message": "Method scape performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.4724%\n"
+        "message": "Method scape performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.4728%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score scape mean_rowwise_pearson", 
-        "value": 0.4724, 
+        "value": 0.4728, 
         "severity": 0, 
-        "severity_value": 0.2362, 
+        "severity_value": 0.2364, 
         "code": "best_score <= 2", 
-        "message": "Method scape performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_pearson\n  Best score: 0.4724%\n"
+        "message": "Method scape performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_pearson\n  Best score: 0.4728%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score transformer_ensemble mean_rowwise_pearson", 
-        "value": 0.2189, 
+        "value": 0.2212, 
         "severity": 0, 
-        "severity_value": -0.2189, 
+        "severity_value": -0.2212, 
         "code": "worst_score >= -1", 
-        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2189%\n"
+        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.2212%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score transformer_ensemble mean_rowwise_pearson", 
-        "value": 0.2189, 
+        "value": 0.2212, 
         "severity": 0, 
-        "severity_value": 0.10945, 
+        "severity_value": 0.1106, 
         "code": "best_score <= 2", 
-        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2189%\n"
+        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_pearson\n  Best score: 0.2212%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score pyboost mean_rowwise_pearson", 
-        "value": 0.462, 
+        "value": 0.4607, 
         "severity": 0, 
-        "severity_value": -0.462, 
+        "severity_value": -0.4607, 
         "code": "worst_score >= -1", 
-        "message": "Method pyboost performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.462%\n"
+        "message": "Method pyboost performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_pearson\n  Worst score: 0.4607%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score pyboost mean_rowwise_pearson", 
-        "value": 0.462, 
+        "value": 0.4607, 
         "severity": 0, 
-        "severity_value": 0.231, 
+        "severity_value": 0.23035, 
         "code": "best_score <= 2", 
-        "message": "Method pyboost performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_pearson\n  Best score: 0.462%\n"
+        "message": "Method pyboost performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_pearson\n  Best score: 0.4607%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score ground_truth mean_rowwise_spearman", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method ground_truth performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_spearman\n  Worst score: 1%\n"
+        "message": "Method ground_truth performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_spearman\n  Worst score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score ground_truth mean_rowwise_spearman", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_spearman\n  Best score: 1%\n"
+        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_spearman\n  Best score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_outcome mean_rowwise_spearman", 
         "value": 0.2117, 
         "severity": 0, 
         "severity_value": -0.2117, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2117%\n"
+        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2117%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_outcome mean_rowwise_spearman", 
         "value": 0.2117, 
         "severity": 0, 
         "severity_value": 0.10585, 
         "code": "best_score <= 2", 
-        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2117%\n"
+        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2117%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_celltypes mean_rowwise_spearman", 
         "value": 0.2806, 
         "severity": 0, 
         "severity_value": -0.2806, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2806%\n"
+        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2806%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_celltypes mean_rowwise_spearman", 
         "value": 0.2806, 
         "severity": 0, 
         "severity_value": 0.1403, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2806%\n"
+        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2806%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_compounds mean_rowwise_spearman", 
         "value": 0.2425, 
         "severity": 0, 
         "severity_value": -0.2425, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2425%\n"
+        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2425%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_compounds mean_rowwise_spearman", 
         "value": 0.2425, 
         "severity": 0, 
         "severity_value": 0.12125, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2425%\n"
+        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2425%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score sample mean_rowwise_spearman", 
-        "value": 0.0548, 
+        "value": 0.0562, 
         "severity": 0, 
-        "severity_value": -0.0548, 
+        "severity_value": -0.0562, 
         "code": "worst_score >= -1", 
-        "message": "Method sample performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.0548%\n"
+        "message": "Method sample performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.0562%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score sample mean_rowwise_spearman", 
-        "value": 0.0548, 
+        "value": 0.0562, 
         "severity": 0, 
-        "severity_value": 0.0274, 
+        "severity_value": 0.0281, 
         "code": "best_score <= 2", 
-        "message": "Method sample performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_spearman\n  Best score: 0.0548%\n"
+        "message": "Method sample performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_spearman\n  Best score: 0.0562%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score zeros mean_rowwise_spearman", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method zeros performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_spearman\n  Worst score: 0%\n"
+        "message": "Method zeros performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_spearman\n  Worst score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score zeros mean_rowwise_spearman", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method zeros performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_spearman\n  Best score: 0%\n"
+        "message": "Method zeros performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_spearman\n  Best score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score lgc_ensemble mean_rowwise_spearman", 
-        "value": 0.4238, 
+        "value": 0.4247, 
         "severity": 0, 
-        "severity_value": -0.4238, 
+        "severity_value": -0.4247, 
         "code": "worst_score >= -1", 
-        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4238%\n"
+        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4247%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score lgc_ensemble mean_rowwise_spearman", 
-        "value": 0.4238, 
+        "value": 0.4247, 
         "severity": 0, 
-        "severity_value": 0.2119, 
+        "severity_value": 0.21235, 
         "code": "best_score <= 2", 
-        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4238%\n"
+        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4247%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score nn_retraining_with_pseudolabels mean_rowwise_spearman", 
-        "value": 0.4617, 
+        "value": 0.4627, 
         "severity": 0, 
-        "severity_value": -0.4617, 
+        "severity_value": -0.4627, 
         "code": "worst_score >= -1", 
-        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4617%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4627%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score nn_retraining_with_pseudolabels mean_rowwise_spearman", 
-        "value": 0.4617, 
+        "value": 0.4627, 
         "severity": 0, 
-        "severity_value": 0.23085, 
+        "severity_value": 0.23135, 
         "code": "best_score <= 2", 
-        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4617%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4627%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score jn_ap_op2 mean_rowwise_spearman", 
-        "value": 0.3061, 
+        "value": 0.3054, 
         "severity": 0, 
-        "severity_value": -0.3061, 
+        "severity_value": -0.3054, 
         "code": "worst_score >= -1", 
-        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.3061%\n"
+        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.3054%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score jn_ap_op2 mean_rowwise_spearman", 
-        "value": 0.3061, 
+        "value": 0.3054, 
         "severity": 0, 
-        "severity_value": 0.15305, 
+        "severity_value": 0.1527, 
         "code": "best_score <= 2", 
-        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_spearman\n  Best score: 0.3061%\n"
+        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_spearman\n  Best score: 0.3054%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score scape mean_rowwise_spearman", 
-        "value": 0.4412, 
+        "value": 0.4417, 
         "severity": 0, 
-        "severity_value": -0.4412, 
+        "severity_value": -0.4417, 
         "code": "worst_score >= -1", 
-        "message": "Method scape performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4412%\n"
+        "message": "Method scape performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4417%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score scape mean_rowwise_spearman", 
-        "value": 0.4412, 
+        "value": 0.4417, 
         "severity": 0, 
-        "severity_value": 0.2206, 
+        "severity_value": 0.22085, 
         "code": "best_score <= 2", 
-        "message": "Method scape performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4412%\n"
+        "message": "Method scape performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4417%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score transformer_ensemble mean_rowwise_spearman", 
-        "value": 0.2142, 
+        "value": 0.2164, 
         "severity": 0, 
-        "severity_value": -0.2142, 
+        "severity_value": -0.2164, 
         "code": "worst_score >= -1", 
-        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2142%\n"
+        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.2164%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score transformer_ensemble mean_rowwise_spearman", 
-        "value": 0.2142, 
+        "value": 0.2164, 
         "severity": 0, 
-        "severity_value": 0.1071, 
+        "severity_value": 0.1082, 
         "code": "best_score <= 2", 
-        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2142%\n"
+        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_spearman\n  Best score: 0.2164%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score pyboost mean_rowwise_spearman", 
-        "value": 0.4405, 
+        "value": 0.439, 
         "severity": 0, 
-        "severity_value": -0.4405, 
+        "severity_value": -0.439, 
         "code": "worst_score >= -1", 
-        "message": "Method pyboost performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.4405%\n"
+        "message": "Method pyboost performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_spearman\n  Worst score: 0.439%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score pyboost mean_rowwise_spearman", 
-        "value": 0.4405, 
+        "value": 0.439, 
         "severity": 0, 
-        "severity_value": 0.22025, 
+        "severity_value": 0.2195, 
         "code": "best_score <= 2", 
-        "message": "Method pyboost performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_spearman\n  Best score: 0.4405%\n"
+        "message": "Method pyboost performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_spearman\n  Best score: 0.439%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score ground_truth mean_rowwise_cosine", 
         "value": 1, 
         "severity": 0, 
         "severity_value": -1.0, 
         "code": "worst_score >= -1", 
-        "message": "Method ground_truth performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_cosine\n  Worst score: 1%\n"
+        "message": "Method ground_truth performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_cosine\n  Worst score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score ground_truth mean_rowwise_cosine", 
         "value": 1, 
         "severity": 0, 
         "severity_value": 0.5, 
         "code": "best_score <= 2", 
-        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_cosine\n  Best score: 1%\n"
+        "message": "Method ground_truth performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: ground_truth\n  Metric id: mean_rowwise_cosine\n  Best score: 1%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_outcome mean_rowwise_cosine", 
         "value": 0.2264, 
         "severity": 0, 
         "severity_value": -0.2264, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.2264%\n"
+        "message": "Method mean_outcome performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.2264%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_outcome mean_rowwise_cosine", 
         "value": 0.2264, 
         "severity": 0, 
         "severity_value": 0.1132, 
         "code": "best_score <= 2", 
-        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_cosine\n  Best score: 0.2264%\n"
+        "message": "Method mean_outcome performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_outcome\n  Metric id: mean_rowwise_cosine\n  Best score: 0.2264%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_celltypes mean_rowwise_cosine", 
         "value": 0.3017, 
         "severity": 0, 
         "severity_value": -0.3017, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.3017%\n"
+        "message": "Method mean_across_celltypes performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.3017%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_celltypes mean_rowwise_cosine", 
         "value": 0.3017, 
         "severity": 0, 
         "severity_value": 0.15085, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_cosine\n  Best score: 0.3017%\n"
+        "message": "Method mean_across_celltypes performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_celltypes\n  Metric id: mean_rowwise_cosine\n  Best score: 0.3017%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score mean_across_compounds mean_rowwise_cosine", 
         "value": 0.2628, 
         "severity": 0, 
         "severity_value": -0.2628, 
         "code": "worst_score >= -1", 
-        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.2628%\n"
+        "message": "Method mean_across_compounds performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.2628%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score mean_across_compounds mean_rowwise_cosine", 
         "value": 0.2628, 
         "severity": 0, 
         "severity_value": 0.1314, 
         "code": "best_score <= 2", 
-        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_cosine\n  Best score: 0.2628%\n"
+        "message": "Method mean_across_compounds performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: mean_across_compounds\n  Metric id: mean_rowwise_cosine\n  Best score: 0.2628%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score sample mean_rowwise_cosine", 
-        "value": 0.0536, 
+        "value": 0.0547, 
         "severity": 0, 
-        "severity_value": -0.0536, 
+        "severity_value": -0.0547, 
         "code": "worst_score >= -1", 
-        "message": "Method sample performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.0536%\n"
+        "message": "Method sample performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.0547%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score sample mean_rowwise_cosine", 
-        "value": 0.0536, 
+        "value": 0.0547, 
         "severity": 0, 
-        "severity_value": 0.0268, 
+        "severity_value": 0.02735, 
         "code": "best_score <= 2", 
-        "message": "Method sample performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_cosine\n  Best score: 0.0536%\n"
+        "message": "Method sample performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: sample\n  Metric id: mean_rowwise_cosine\n  Best score: 0.0547%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score zeros mean_rowwise_cosine", 
         "value": 0, 
         "severity": 0, 
         "severity_value": -0.0, 
         "code": "worst_score >= -1", 
-        "message": "Method zeros performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_cosine\n  Worst score: 0%\n"
+        "message": "Method zeros performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_cosine\n  Worst score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score zeros mean_rowwise_cosine", 
         "value": 0, 
         "severity": 0, 
         "severity_value": 0.0, 
         "code": "best_score <= 2", 
-        "message": "Method zeros performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_cosine\n  Best score: 0%\n"
+        "message": "Method zeros performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: zeros\n  Metric id: mean_rowwise_cosine\n  Best score: 0%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score lgc_ensemble mean_rowwise_cosine", 
-        "value": 0.4542, 
+        "value": 0.4552, 
         "severity": 0, 
-        "severity_value": -0.4542, 
+        "severity_value": -0.4552, 
         "code": "worst_score >= -1", 
-        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4542%\n"
+        "message": "Method lgc_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4552%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score lgc_ensemble mean_rowwise_cosine", 
-        "value": 0.4542, 
+        "value": 0.4552, 
         "severity": 0, 
-        "severity_value": 0.2271, 
+        "severity_value": 0.2276, 
         "code": "best_score <= 2", 
-        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4542%\n"
+        "message": "Method lgc_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: lgc_ensemble\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4552%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score nn_retraining_with_pseudolabels mean_rowwise_cosine", 
-        "value": 0.4932, 
+        "value": 0.4941, 
         "severity": 0, 
-        "severity_value": -0.4932, 
+        "severity_value": -0.4941, 
         "code": "worst_score >= -1", 
-        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4932%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4941%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score nn_retraining_with_pseudolabels mean_rowwise_cosine", 
-        "value": 0.4932, 
+        "value": 0.4941, 
         "severity": 0, 
-        "severity_value": 0.2466, 
+        "severity_value": 0.24705, 
         "code": "best_score <= 2", 
-        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4932%\n"
+        "message": "Method nn_retraining_with_pseudolabels performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: nn_retraining_with_pseudolabels\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4941%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score jn_ap_op2 mean_rowwise_cosine", 
-        "value": 0.3291, 
+        "value": 0.3289, 
         "severity": 0, 
-        "severity_value": -0.3291, 
+        "severity_value": -0.3289, 
         "code": "worst_score >= -1", 
-        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.3291%\n"
+        "message": "Method jn_ap_op2 performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.3289%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score jn_ap_op2 mean_rowwise_cosine", 
-        "value": 0.3291, 
+        "value": 0.3289, 
         "severity": 0, 
-        "severity_value": 0.16455, 
+        "severity_value": 0.16445, 
         "code": "best_score <= 2", 
-        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_cosine\n  Best score: 0.3291%\n"
+        "message": "Method jn_ap_op2 performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: jn_ap_op2\n  Metric id: mean_rowwise_cosine\n  Best score: 0.3289%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score scape mean_rowwise_cosine", 
-        "value": 0.4754, 
+        "value": 0.4758, 
         "severity": 0, 
-        "severity_value": -0.4754, 
+        "severity_value": -0.4758, 
         "code": "worst_score >= -1", 
-        "message": "Method scape performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4754%\n"
+        "message": "Method scape performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4758%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score scape mean_rowwise_cosine", 
-        "value": 0.4754, 
+        "value": 0.4758, 
         "severity": 0, 
-        "severity_value": 0.2377, 
+        "severity_value": 0.2379, 
         "code": "best_score <= 2", 
-        "message": "Method scape performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4754%\n"
+        "message": "Method scape performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: scape\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4758%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score transformer_ensemble mean_rowwise_cosine", 
-        "value": 0.2245, 
+        "value": 0.2267, 
         "severity": 0, 
-        "severity_value": -0.2245, 
+        "severity_value": -0.2267, 
         "code": "worst_score >= -1", 
-        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.2245%\n"
+        "message": "Method transformer_ensemble performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.2267%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score transformer_ensemble mean_rowwise_cosine", 
-        "value": 0.2245, 
+        "value": 0.2267, 
         "severity": 0, 
-        "severity_value": 0.11225, 
+        "severity_value": 0.11335, 
         "code": "best_score <= 2", 
-        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_cosine\n  Best score: 0.2245%\n"
+        "message": "Method transformer_ensemble performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: transformer_ensemble\n  Metric id: mean_rowwise_cosine\n  Best score: 0.2267%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Worst score pyboost mean_rowwise_cosine", 
-        "value": 0.4651, 
+        "value": 0.4638, 
         "severity": 0, 
-        "severity_value": -0.4651, 
+        "severity_value": -0.4638, 
         "code": "worst_score >= -1", 
-        "message": "Method pyboost performs much worse than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4651%\n"
+        "message": "Method pyboost performs much worse than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_cosine\n  Worst score: 0.4638%\n"
     }, 
     {
-        "task_id": "perturbation_prediction", 
+        "task_id": "task_perturbation_prediction", 
         "category": "Scaling", 
         "name": "Best score pyboost mean_rowwise_cosine", 
-        "value": 0.4651, 
+        "value": 0.4638, 
         "severity": 0, 
-        "severity_value": 0.23255, 
+        "severity_value": 0.2319, 
         "code": "best_score <= 2", 
-        "message": "Method pyboost performs a lot better than baselines.\n  Task id: perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4651%\n"
+        "message": "Method pyboost performs a lot better than baselines.\n  Task id: task_perturbation_prediction\n  Method id: pyboost\n  Metric id: mean_rowwise_cosine\n  Best score: 0.4638%\n"
     }
 ]
\ No newline at end of file
diff --git a/results/perturbation_prediction/data/results.json b/results/perturbation_prediction/data/results.json
index ec40c6c8..b5fb6346 100644
--- a/results/perturbation_prediction/data/results.json
+++ b/results/perturbation_prediction/data/results.json
@@ -20,64 +20,61 @@
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 6.5,
-      "cpu_pct": 336.7,
-      "peak_memory_mb": 6144,
+      "duration_sec": 9,
+      "cpu_pct": 206.8,
+      "peak_memory_mb": 3482,
       "disk_read_mb": 135,
       "disk_write_mb": 6
-    },
-    "task_id": "perturbation_prediction"
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "jn_ap_op2",
     "metric_values": {
-      "mean_rowwise_cosine": 0.3291,
-      "mean_rowwise_mae": 0.6493,
+      "mean_rowwise_cosine": 0.3289,
+      "mean_rowwise_mae": 0.6518,
       "mean_rowwise_pearson": 0.3267,
-      "mean_rowwise_rmse": 0.8939,
-      "mean_rowwise_spearman": 0.3061
+      "mean_rowwise_rmse": 0.8965,
+      "mean_rowwise_spearman": 0.3054
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.3291,
-      "mean_rowwise_mae": 0.3258,
+      "mean_rowwise_cosine": 0.3289,
+      "mean_rowwise_mae": 0.3233,
       "mean_rowwise_pearson": 0.3267,
-      "mean_rowwise_rmse": 0.3442,
-      "mean_rowwise_spearman": 0.3061
+      "mean_rowwise_rmse": 0.3425,
+      "mean_rowwise_spearman": 0.3054
     },
-    "mean_score": 0.3264,
+    "mean_score": 0.3254,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 3028,
-      "cpu_pct": 99.6,
-      "peak_memory_mb": 14439,
-      "disk_read_mb": 225,
+      "duration_sec": 3061,
+      "cpu_pct": 100,
+      "peak_memory_mb": 16077,
+      "disk_read_mb": 228,
       "disk_write_mb": 4
-    },
-    "task_id": "perturbation_prediction"
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "lgc_ensemble",
     "metric_values": {
-      "mean_rowwise_cosine": 0.4542,
-      "mean_rowwise_mae": 0.584,
-      "mean_rowwise_pearson": 0.4503,
-      "mean_rowwise_rmse": 0.8016,
-      "mean_rowwise_spearman": 0.4238
+      "mean_rowwise_cosine": 0.4552,
+      "mean_rowwise_mae": 0.5756,
+      "mean_rowwise_pearson": 0.4514,
+      "mean_rowwise_rmse": 0.7921,
+      "mean_rowwise_spearman": 0.4247
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.4542,
-      "mean_rowwise_mae": 0.3936,
-      "mean_rowwise_pearson": 0.4503,
-      "mean_rowwise_rmse": 0.4119,
-      "mean_rowwise_spearman": 0.4238
+      "mean_rowwise_cosine": 0.4552,
+      "mean_rowwise_mae": 0.4024,
+      "mean_rowwise_pearson": 0.4514,
+      "mean_rowwise_rmse": 0.4191,
+      "mean_rowwise_spearman": 0.4247
     },
-    "mean_score": 0.4268,
+    "mean_score": 0.4306,
     "normalization_id": null,
-    "resources": {},
-    "task_id": "perturbation_prediction"
+    "resources": {}
   },
   {
     "dataset_id": "neurips-2023-data",
@@ -91,22 +88,21 @@
     },
     "scaled_scores": {
       "mean_rowwise_cosine": 0.3017,
-      "mean_rowwise_mae": 0.3316,
+      "mean_rowwise_mae": 0.3318,
       "mean_rowwise_pearson": 0.2972,
-      "mean_rowwise_rmse": 0.3453,
+      "mean_rowwise_rmse": 0.3455,
       "mean_rowwise_spearman": 0.2806
     },
     "mean_score": 0.3113,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 7.2,
-      "cpu_pct": 177.2,
+      "duration_sec": 3,
+      "cpu_pct": 472.1,
       "peak_memory_mb": 5735,
-      "disk_read_mb": 192,
+      "disk_read_mb": 193,
       "disk_write_mb": 2
-    },
-    "task_id": "perturbation_prediction"
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
@@ -120,22 +116,21 @@
     },
     "scaled_scores": {
       "mean_rowwise_cosine": 0.2628,
-      "mean_rowwise_mae": 0.2754,
+      "mean_rowwise_mae": 0.2755,
       "mean_rowwise_pearson": 0.2594,
-      "mean_rowwise_rmse": 0.3083,
+      "mean_rowwise_rmse": 0.3086,
       "mean_rowwise_spearman": 0.2425
     },
-    "mean_score": 0.2697,
+    "mean_score": 0.2698,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 3.3,
-      "cpu_pct": 374.8,
-      "peak_memory_mb": 5837,
-      "disk_read_mb": 192,
+      "duration_sec": 6.4,
+      "cpu_pct": 213.1,
+      "peak_memory_mb": 5940,
+      "disk_read_mb": 193,
       "disk_write_mb": 4
-    },
-    "task_id": "perturbation_prediction"
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
@@ -149,167 +144,161 @@
     },
     "scaled_scores": {
       "mean_rowwise_cosine": 0.2264,
-      "mean_rowwise_mae": 0.34,
+      "mean_rowwise_mae": 0.3401,
       "mean_rowwise_pearson": 0.2198,
-      "mean_rowwise_rmse": 0.3403,
+      "mean_rowwise_rmse": 0.3405,
       "mean_rowwise_spearman": 0.2117
     },
     "mean_score": 0.2677,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 3,
-      "cpu_pct": 491.7,
-      "peak_memory_mb": 5735,
-      "disk_read_mb": 192,
+      "duration_sec": 5.3,
+      "cpu_pct": 265.6,
+      "peak_memory_mb": 5632,
+      "disk_read_mb": 193,
       "disk_write_mb": 1
-    },
-    "task_id": "perturbation_prediction"
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "nn_retraining_with_pseudolabels",
     "metric_values": {
-      "mean_rowwise_cosine": 0.4932,
-      "mean_rowwise_mae": 0.5469,
-      "mean_rowwise_pearson": 0.49,
-      "mean_rowwise_rmse": 0.7568,
-      "mean_rowwise_spearman": 0.4617
+      "mean_rowwise_cosine": 0.4941,
+      "mean_rowwise_mae": 0.5464,
+      "mean_rowwise_pearson": 0.4909,
+      "mean_rowwise_rmse": 0.7562,
+      "mean_rowwise_spearman": 0.4627
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.4932,
-      "mean_rowwise_mae": 0.4321,
-      "mean_rowwise_pearson": 0.49,
-      "mean_rowwise_rmse": 0.4448,
-      "mean_rowwise_spearman": 0.4617
+      "mean_rowwise_cosine": 0.4941,
+      "mean_rowwise_mae": 0.4328,
+      "mean_rowwise_pearson": 0.4909,
+      "mean_rowwise_rmse": 0.4455,
+      "mean_rowwise_spearman": 0.4627
     },
-    "mean_score": 0.4644,
+    "mean_score": 0.4652,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 1701,
-      "cpu_pct": 638.2,
-      "peak_memory_mb": 36045,
-      "disk_read_mb": 234,
-      "disk_write_mb": 7
-    },
-    "task_id": "perturbation_prediction"
+      "duration_sec": 2546,
+      "cpu_pct": 1293.7,
+      "peak_memory_mb": 52736,
+      "disk_read_mb": 247,
+      "disk_write_mb": 8
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "pyboost",
     "metric_values": {
-      "mean_rowwise_cosine": 0.4651,
-      "mean_rowwise_mae": 0.5603,
-      "mean_rowwise_pearson": 0.462,
-      "mean_rowwise_rmse": 0.7949,
-      "mean_rowwise_spearman": 0.4405
+      "mean_rowwise_cosine": 0.4638,
+      "mean_rowwise_mae": 0.5609,
+      "mean_rowwise_pearson": 0.4607,
+      "mean_rowwise_rmse": 0.7957,
+      "mean_rowwise_spearman": 0.439
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.4651,
-      "mean_rowwise_mae": 0.4182,
-      "mean_rowwise_pearson": 0.462,
-      "mean_rowwise_rmse": 0.4168,
-      "mean_rowwise_spearman": 0.4405
+      "mean_rowwise_cosine": 0.4638,
+      "mean_rowwise_mae": 0.4178,
+      "mean_rowwise_pearson": 0.4607,
+      "mean_rowwise_rmse": 0.4165,
+      "mean_rowwise_spearman": 0.439
     },
-    "mean_score": 0.4405,
+    "mean_score": 0.4395,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 165,
-      "cpu_pct": 114.8,
-      "peak_memory_mb": 15668,
-      "disk_read_mb": 250,
-      "disk_write_mb": 12
-    },
-    "task_id": "perturbation_prediction"
+      "duration_sec": 151,
+      "cpu_pct": 154.1,
+      "peak_memory_mb": 22221,
+      "disk_read_mb": 261,
+      "disk_write_mb": 13
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "sample",
     "metric_values": {
-      "mean_rowwise_cosine": 0.0536,
-      "mean_rowwise_mae": 0.9631,
-      "mean_rowwise_pearson": 0.0514,
-      "mean_rowwise_rmse": 1.3631,
-      "mean_rowwise_spearman": 0.0548
+      "mean_rowwise_cosine": 0.0547,
+      "mean_rowwise_mae": 0.9633,
+      "mean_rowwise_pearson": 0.0524,
+      "mean_rowwise_rmse": 1.3636,
+      "mean_rowwise_spearman": 0.0562
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.0536,
+      "mean_rowwise_cosine": 0.0547,
       "mean_rowwise_mae": 0,
-      "mean_rowwise_pearson": 0.0514,
+      "mean_rowwise_pearson": 0.0524,
       "mean_rowwise_rmse": 0,
-      "mean_rowwise_spearman": 0.0548
+      "mean_rowwise_spearman": 0.0562
     },
-    "mean_score": 0.032,
+    "mean_score": 0.0326,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 7.2,
-      "cpu_pct": 324.7,
-      "peak_memory_mb": 6247,
-      "disk_read_mb": 204,
+      "duration_sec": 8.9,
+      "cpu_pct": 216.6,
+      "peak_memory_mb": 3584,
+      "disk_read_mb": 203,
       "disk_write_mb": 6
-    },
-    "task_id": "perturbation_prediction"
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "scape",
     "metric_values": {
-      "mean_rowwise_cosine": 0.4754,
-      "mean_rowwise_mae": 0.5726,
-      "mean_rowwise_pearson": 0.4724,
-      "mean_rowwise_rmse": 0.7754,
-      "mean_rowwise_spearman": 0.4412
+      "mean_rowwise_cosine": 0.4758,
+      "mean_rowwise_mae": 0.5722,
+      "mean_rowwise_pearson": 0.4728,
+      "mean_rowwise_rmse": 0.7748,
+      "mean_rowwise_spearman": 0.4417
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.4754,
-      "mean_rowwise_mae": 0.4055,
-      "mean_rowwise_pearson": 0.4724,
-      "mean_rowwise_rmse": 0.4311,
-      "mean_rowwise_spearman": 0.4412
+      "mean_rowwise_cosine": 0.4758,
+      "mean_rowwise_mae": 0.406,
+      "mean_rowwise_pearson": 0.4728,
+      "mean_rowwise_rmse": 0.4318,
+      "mean_rowwise_spearman": 0.4417
     },
-    "mean_score": 0.4451,
+    "mean_score": 0.4456,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 10079,
-      "cpu_pct": 134.2,
-      "peak_memory_mb": 67482,
+      "duration_sec": 8392,
+      "cpu_pct": 737.4,
+      "peak_memory_mb": 44237,
       "disk_read_mb": 228,
-      "disk_write_mb": 24269
-    },
-    "task_id": "perturbation_prediction"
+      "disk_write_mb": 25088
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
     "method_id": "transformer_ensemble",
     "metric_values": {
-      "mean_rowwise_cosine": 0.2245,
-      "mean_rowwise_mae": 0.6277,
-      "mean_rowwise_pearson": 0.2189,
-      "mean_rowwise_rmse": 0.8974,
-      "mean_rowwise_spearman": 0.2142
+      "mean_rowwise_cosine": 0.2267,
+      "mean_rowwise_mae": 0.6339,
+      "mean_rowwise_pearson": 0.2212,
+      "mean_rowwise_rmse": 0.8988,
+      "mean_rowwise_spearman": 0.2164
     },
     "scaled_scores": {
-      "mean_rowwise_cosine": 0.2245,
-      "mean_rowwise_mae": 0.3483,
-      "mean_rowwise_pearson": 0.2189,
-      "mean_rowwise_rmse": 0.3417,
-      "mean_rowwise_spearman": 0.2142
+      "mean_rowwise_cosine": 0.2267,
+      "mean_rowwise_mae": 0.342,
+      "mean_rowwise_pearson": 0.2212,
+      "mean_rowwise_rmse": 0.3408,
+      "mean_rowwise_spearman": 0.2164
     },
-    "mean_score": 0.2695,
+    "mean_score": 0.2694,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 8095,
-      "cpu_pct": 139.4,
-      "peak_memory_mb": 14848,
-      "disk_read_mb": 224,
-      "disk_write_mb": 12
-    },
-    "task_id": "perturbation_prediction"
+      "duration_sec": 3933,
+      "cpu_pct": 216.4,
+      "peak_memory_mb": 18944,
+      "disk_read_mb": 227,
+      "disk_write_mb": 6
+    }
   },
   {
     "dataset_id": "neurips-2023-data",
@@ -323,21 +312,20 @@
     },
     "scaled_scores": {
       "mean_rowwise_cosine": 0,
-      "mean_rowwise_mae": 0.3406,
+      "mean_rowwise_mae": 0.3407,
       "mean_rowwise_pearson": 0,
-      "mean_rowwise_rmse": 0.3266,
+      "mean_rowwise_rmse": 0.3268,
       "mean_rowwise_spearman": 0
     },
-    "mean_score": 0.1334,
+    "mean_score": 0.1335,
     "normalization_id": null,
     "resources": {
       "exit_code": 0,
-      "duration_sec": 3.6,
-      "cpu_pct": 412,
-      "peak_memory_mb": 5735,
-      "disk_read_mb": 192,
+      "duration_sec": 5.8,
+      "cpu_pct": 234.9,
+      "peak_memory_mb": 5837,
+      "disk_read_mb": 193,
       "disk_write_mb": 1
-    },
-    "task_id": "perturbation_prediction"
+    }
   }
 ]
diff --git a/results/perturbation_prediction/data/task_info.json b/results/perturbation_prediction/data/task_info.json
index bfd0521e..343bd4ac 100644
--- a/results/perturbation_prediction/data/task_info.json
+++ b/results/perturbation_prediction/data/task_info.json
@@ -1,8 +1,90 @@
 {
-  "task_id": "perturbation_prediction",
+  "task_id": "task_perturbation_prediction",
   "commit_sha": null,
   "task_name": "Perturbation Prediction",
   "task_summary": "Predicting how small molecules change gene expression in different cell types.",
-  "task_description": "Human biology can be complex, in part due to the function and interplay of the body's\napproximately 37 trillion cells, which are organized into tissues, organs, and systems.\nHowever, recent advances in single-cell technologies have provided unparalleled insight\ninto the function of cells and tissues at the level of DNA, RNA, and proteins. Yet\nleveraging single-cell methods to develop medicines requires mapping causal links\nbetween chemical perturbations and the downstream impact on cell state. These experiments\nare costly and labor intensive, and not all cells and tissues are amenable to\nhigh-throughput transcriptomic screening. If data science could help accurately predict\nchemical perturbations in new cell types, it could accelerate and expand the development\nof new medicines.\n\nSeveral methods have been developed for drug perturbation prediction, most of which are\nvariations on the autoencoder architecture (Dr.VAE, scGEN, and ChemCPA). However, these\nmethods lack proper benchmarking datasets with diverse cell types to determine how well\nthey generalize. The largest available training dataset is the NIH-funded Connectivity\nMap (CMap), which comprises over 1.3M small molecule perturbation measurements. However,\nthe CMap includes observations of only 978 genes, less than 5% of all genes. Furthermore,\nthe CMap data is comprised almost entirely of measurements in cancer cell lines, which\nmay not accurately represent human biology.\n\n\nThis task aims to predict how small molecules change gene expression in different cell\ntypes. This task was a [Kaggle competition](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/overview)\nas part of the [NeurIPS 2023 competition track](https://neurips.cc/virtual/2023/competition/66586).\n\nThe task is to predict the gene expression profile of a cell after a small molecule\nperturbation. 
For this competition, we designed and generated a novel single-cell\nperturbational dataset in human peripheral blood mononuclear cells (PBMCs). We\nselected 144 compounds from the Library of Integrated Network-Based Cellular Signatures\n(LINCS) Connectivity Map dataset ([PMID: 29195078](https://pubmed.ncbi.nlm.nih.gov/29195078/))\nand measured single-cell gene\nexpression profiles after 24 hours of treatment. The experiment was repeated in three\nhealthy human donors, and the compounds were selected based on diverse transcriptional\nsignatures observed in CD34+ hematopoietic stem cells (data not released). We performed\nthis experiment in human PBMCs because the cells are commercially available with\npre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue\nthat contains multiple mature cell types (including T-cells, B-cells, myeloid cells,\nand NK cells) with established markers for annotation of cell types. To supplement this\ndataset, we also measured cells from each donor at baseline with joint scRNA and\nsingle-cell chromatin accessibility measurements using the 10x Multiome assay. We hope\nthat the addition of rich multi-omic data for each donor and cell type at baseline will\nhelp establish biological priors that explain the susceptibility of particular genes to\nexhibit perturbation responses in difference biological contexts.\n",
-  "repo": "openproblems-bio/openproblems-v2"
+  "task_description": "Human biology can be complex, in part due to the function and interplay of the body's\napproximately 37 trillion cells, which are organized into tissues, organs, and systems.\nHowever, recent advances in single-cell technologies have provided unparalleled insight\ninto the function of cells and tissues at the level of DNA, RNA, and proteins. Yet\nleveraging single-cell methods to develop medicines requires mapping causal links\nbetween chemical perturbations and the downstream impact on cell state. These experiments\nare costly and labor intensive, and not all cells and tissues are amenable to\nhigh-throughput transcriptomic screening. If data science could help accurately predict\nchemical perturbations in new cell types, it could accelerate and expand the development\nof new medicines.\n\nSeveral methods have been developed for drug perturbation prediction, most of which are\nvariations on the autoencoder architecture (Dr.VAE, scGEN, and ChemCPA). However, these\nmethods lack proper benchmarking datasets with diverse cell types to determine how well\nthey generalize. The largest available training dataset is the NIH-funded Connectivity\nMap (CMap), which comprises over 1.3M small molecule perturbation measurements. However,\nthe CMap includes observations of only 978 genes, less than 5% of all genes. Furthermore,\nthe CMap data is comprised almost entirely of measurements in cancer cell lines, which\nmay not accurately represent human biology.\n\nThis task aims to predict how small molecules change gene expression in different cell\ntypes. This task was a [Kaggle competition](https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/overview)\nas part of the [NeurIPS 2023 competition track](https://neurips.cc/virtual/2023/competition/66586).\n\nThe task is to predict the gene expression profile of a cell after a small molecule\nperturbation. 
For this competition, we designed and generated a novel single-cell\nperturbational dataset in human peripheral blood mononuclear cells (PBMCs). We\nselected 144 compounds from the Library of Integrated Network-Based Cellular Signatures\n(LINCS) Connectivity Map dataset ([PMID: 29195078](https://pubmed.ncbi.nlm.nih.gov/29195078/))\nand measured single-cell gene\nexpression profiles after 24 hours of treatment. The experiment was repeated in three\nhealthy human donors, and the compounds were selected based on diverse transcriptional\nsignatures observed in CD34+ hematopoietic stem cells (data not released). We performed\nthis experiment in human PBMCs because the cells are commercially available with\npre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue\nthat contains multiple mature cell types (including T-cells, B-cells, myeloid cells,\nand NK cells) with established markers for annotation of cell types. To supplement this\ndataset, we also measured cells from each donor at baseline with joint scRNA and\nsingle-cell chromatin accessibility measurements using the 10x Multiome assay. We hope\nthat the addition of rich multi-omic data for each donor and cell type at baseline will\nhelp establish biological priors that explain the susceptibility of particular genes to\nexhibit perturbation responses in different biological contexts.\n",
+  "repo": "openproblems-bio/task_perturbation_prediction",
+  "authors": [
+    {
+      "name": "Artur Szałata",
+      "roles": "author",
+      "info": {
+        "github": "szalata",
+        "orcid": "0000-0001-8413-234X"
+      }
+    },
+    {
+      "name": "Robrecht Cannoodt",
+      "roles": "author",
+      "info": {
+        "github": "rcannood",
+        "orcid": "0000-0003-3641-729X"
+      }
+    },
+    {
+      "name": "Daniel Burkhardt",
+      "roles": "author",
+      "info": {
+        "github": "dburkhardt",
+        "orcid": "0000-0001-7744-1363"
+      }
+    },
+    {
+      "name": "Malte D. Luecken",
+      "roles": "author",
+      "info": {
+        "github": "LuckyMD",
+        "orcid": "0000-0001-7464-7921"
+      }
+    },
+    {
+      "name": "Tin M. Tunjic",
+      "roles": "contributor",
+      "info": {
+        "github": "ttunja",
+        "orcid": "0000-0001-8842-6548"
+      }
+    },
+    {
+      "name": "Mengbo Wang",
+      "roles": "contributor",
+      "info": {
+        "github": "wangmengbo",
+        "orcid": "0000-0002-0266-9993"
+      }
+    },
+    {
+      "name": "Andrew Benz",
+      "roles": "author",
+      "info": {
+        "github": "andrew-benz",
+        "orcid": "0009-0002-8118-1861"
+      }
+    },
+    {
+      "name": "Tianyu Liu",
+      "roles": "contributor",
+      "info": {
+        "github": "HelloWorldLTY",
+        "orcid": "0000-0002-9412-6573"
+      }
+    },
+    {
+      "name": "Jalil Nourisa",
+      "roles": "contributor",
+      "info": {
+        "github": "janursa",
+        "orcid": "0000-0002-7539-4396"
+      }
+    },
+    {
+      "name": "Rico Meinl",
+      "roles": "contributor",
+      "info": {
+        "github": "ricomnl",
+        "orcid": "0000-0003-4356-6058"
+      }
+    }
+  ]
 }