Skip to content

Commit

Permalink
update results
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Oct 31, 2024
1 parent d9adbb5 commit 2ed4898
Show file tree
Hide file tree
Showing 7 changed files with 859 additions and 780 deletions.
9 changes: 4 additions & 5 deletions results/perturbation_prediction/data/dataset_info.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
[
{
"task_id": "perturbation_prediction",
"dataset_id": "neurips-2023-data",
"dataset_name": "NeurIPS2023 scPerturb DGE",
"dataset_summary": "Differential gene expression sign(logFC) * -log10(p-value) values after 24 hours of treatment with 144 compounds in human PBMCs",
"dataset_description": "For this competition, we designed and generated a novel single-cell perturbational dataset in human peripheral blood mononuclear cells (PBMCs). We selected 144 compounds from the Library of Integrated Network-Based Cellular Signatures (LINCS) Connectivity Map dataset (PMID: 29195078) and measured single-cell gene expression profiles after 24 hours of treatment. The experiment was repeated in three healthy human donors, and the compounds were selected based on diverse transcriptional signatures observed in CD34+ hematopoietic stem cells (data not released). We performed this experiment in human PBMCs because the cells are commercially available with pre-obtained consent for public release and PBMCs are a primary, disease-relevant tissue that contains multiple mature cell types (including T-cells, B-cells, myeloid cells, and NK cells) with established markers for annotation of cell types. To supplement this dataset, we also measured cells from each donor at baseline with joint scRNA and single-cell chromatin accessibility measurements using the 10x Multiome assay. We hope that the addition of rich multi-omic data for each donor and cell type at baseline will help establish biological priors that explain the susceptibility of particular genes to exhibit perturbation responses in different biological contexts.",
"data_reference": "TBD",
"data_url": "TBD",
"date_created": "02-06-2024",
"file_size": 183170148
"data_reference": "@article{slazata2024benchmark,\n\ttitle = {A benchmark for prediction of transcriptomic responses to chemical perturbations across cell types},\n\tauthor = {Artur Szałata and Andrew Benz and Robrecht Cannoodt and Mauricio Cortes and Jason Fong and Sunil Kuppasani and Richard Lieberman and Tianyu Liu and Javier A. Mas-Rosario and Rico Meinl and Jalil Nourisa and Jared Tumiel and Tin M. Tunjic and Mengbo Wang and Noah Weber and Hongyu Zhao and Benedict Anchang and Fabian J Theis and Malte D Luecken and Daniel B Burkhardt},\n\tbooktitle = {The Thirty-eight Conference on Neural Information Processing Systems Datasets and Benchmarks Track},\n\tyear = {2024},\n\turl = {https://openreview.net/forum?id=WTI4RJYSVm}\n}",
"data_url": "https://trace.ncbi.nlm.nih.gov/Traces/?view=study&acc=SRP527159",
"date_created": "31-10-2024",
"file_size": 183168750
}
]
64 changes: 32 additions & 32 deletions results/perturbation_prediction/data/method_info.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
"is_baseline": true,
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/ground_truth/config.vsh.yaml",
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/ground_truth/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "control_methods",
Expand All @@ -21,9 +21,9 @@
"is_baseline": true,
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/mean_outcome/config.vsh.yaml",
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/mean_outcome/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "control_methods",
Expand All @@ -34,9 +34,9 @@
"is_baseline": true,
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/mean_across_celltypes/config.vsh.yaml",
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/mean_across_celltypes/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "control_methods",
Expand All @@ -47,9 +47,9 @@
"is_baseline": true,
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/mean_across_compounds/config.vsh.yaml",
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/mean_across_compounds/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "control_methods",
Expand All @@ -60,9 +60,9 @@
"is_baseline": true,
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/sample/config.vsh.yaml",
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/sample/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "control_methods",
Expand All @@ -73,9 +73,9 @@
"is_baseline": true,
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/control_methods/zeros/config.vsh.yaml",
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/control_methods/zeros/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "methods",
Expand All @@ -85,10 +85,10 @@
"method_description": "An ensemble of LSTM, GRU, and 1D CNN models with a variety of input features derived from ChemBERTa embeddings,\none-hot encoding of cell type/small molecule pairs, and various statistical measures of target gene expression.\nThe models were trained with a combination of MSE, MAE, LogCosh, and BCE loss functions to improve their\nrobustness and predictive performance. The approach also included data augmentation techniques to ensure\ngeneralization and account for noise in the data.\n",
"is_baseline": false,
"paper_reference": null,
"code_url": "https://github.com/Jean-KOUAGOU/1st-place-solution-single-cell-pbs/tree/main",
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/lgc_ensemble/config.vsh.yaml",
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/lgc_ensemble/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "methods",
Expand All @@ -98,23 +98,23 @@
"method_description": "The prediction system is two staged, so I publish two versions of the notebook.\nThe first stage predicts pseudolabels. To be honest, if I stopped on this version, I would not be the third.\nThe predicted pseudolabels on all test data (255 rows) are added to training in the second stage.\n\n**Stage 1 preparing pseudolabels**: The main part of this system is a neural network. Every neural network and its environment was optimized by optuna. Hyperparameters that have been optimized:\na dropout value, a number of neurons in particular layers, an output dimension of an embedding layer, a number of epochs, a learning rate, a batch size, a number of dimension of truncated singular value decomposition.\nThe optimization was done on custom 4-folds cross validation. In order to avoid overfitting to cross validation by optuna I applied 2 repeats for every fold and took an average. Generally, the more, the better. The optuna's criterion was MRRMSE.\nFinally, 7 models were ensembled. Optuna was applied again to determine best weights of linear combination. The prediction of test set is the pseudolabels now and will be used in second stage.\n\n**Stage 2 retraining with pseudolabels**: The pseudolabels (255 rows) were added to the training dataset. I applied 20 models with optimized parameters in different experiments for a model diversity.\nOptuna selected optimal weights for the linear combination of the prediction again.\nModels had high variance, so every model was trained 10 times on all dataset and the median of prediction is taken as a final prediction. The prediction was additionally clipped to colwise min and max. \n",
"is_baseline": false,
"paper_reference": null,
"code_url": "https://github.com/okon2000/single_cell_perturbations",
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/nn_retraining_with_pseudolabels/config.vsh.yaml",
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/nn_retraining_with_pseudolabels/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "methods",
"method_id": "jn_ap_op2",
"method_name": "JN-AP-OP2",
"method_summary": "Deep learning architecture composed of 2 modules: a sample-centric MLP and a gene-centric MLP",
"method_description": "We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,\nwhere n_encode is 2. Then, X is passed to a MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs the same dimension data.\nThe purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (original encoded data) and feed it\nto MLP2 which receives n_smaples*n_genes, (n_encode + n_encode) and results n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)\ncombination. This is to overcome the underdetermination problem due to lack of sufficient (compound, cell_type) samples. \n",
"method_description": "We first encode each sample using leave-one-out encoder based on compound and cell type. This produces X with the dimension of n_samples, n_genes, n_encode,\nwhere n_encode is 2. Then, X is passed to a MLP1 sample-wise with input of n_samples, n_genes*n_encode, which outputs the same dimension data.\nThe purpose of this MLP is to learn inter-gene relationships. Then, we group the output of MLP1 with X (original encoded data) and feed it\nto MLP2 which receives n_samples*n_genes, (n_encode + n_encode) and results n_samples*n_genes. This MLP2 trains on each (compound, cell_type, gene)\ncombination. This is to overcome the underdetermination problem due to lack of sufficient (compound, cell_type) samples.\n",
"is_baseline": false,
"paper_reference": null,
"code_url": "https://github.com/AntoinePassemiers/Open-Challenges-Single-Cell-Perturbations",
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/jn_ap_op2/config.vsh.yaml",
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/jn_ap_op2/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "methods",
Expand All @@ -123,11 +123,11 @@
"method_summary": "Neural network model for drug effect prediction",
"method_description": "ScAPE utilises a neural network (NN) model to estimate drug effects on gene expression in\nperipheral blood mononuclear cells (PBMCs). The model took drug and cell features as input,\nwith these features primarily derived from the median of signed log-pvalues and log fold-changes\ngrouped by drug and cell type. The NN was trained using a leave-one-drug-out cross-validation\nstrategy, focusing on NK cells as a representative cell type due to their similarity to B cells\nand Myeloid cells in principal component analysis. Model performance was evaluated by comparing\nits predictions against two baselines: predicting zero effect and predicting the median\nlog-pvalue for each drug. The final submission combined predictions from models trained on\ndifferent gene and drug subsets, aiming to enhance overall prediction accuracy.\n",
"is_baseline": false,
"paper_reference": "pablormier2023scape",
"code_url": "https://github.com/scapeML/scape",
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/scape/config.vsh.yaml",
"paper_reference": null,
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/scape/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "methods",
Expand All @@ -137,10 +137,10 @@
"method_description": "This method employs an ensemble of four transformer models,\neach with different weights and trained on slightly varying feature sets.\nThe feature engineering process involved one-hot encoding of categorical labels,\ntarget encoding using mean and standard deviation, and enriching the feature set\nwith the standard deviation of target variables. Additionally, the dataset was\ncarefully examined to ensure data cleanliness. A sophisticated sampling strategy\nbased on K-Means clustering was employed to partition the data into training and\nvalidation sets, ensuring a representative distribution. The model architecture\nleveraged sparse and dense feature encoding, along with a transformer for effective\nlearning.\n",
"is_baseline": false,
"paper_reference": null,
"code_url": "https://github.com/Eliorkalfon/single_cell_pb",
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/transformer_ensemble/config.vsh.yaml",
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/transformer_ensemble/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
},
{
"task_id": "methods",
Expand All @@ -150,9 +150,9 @@
"method_description": "An ensemble of four models was considered: \n\n* Py-boost (a ridge regression-based recommender system)\n* ExtraTrees (a decision tree ensemble with target-encoded features)\n* a k-nearest neighbors recommender system\n* a ridge regression model\n\nEach model offered distinct strengths and weaknesses: ExtraTrees and\nknn were unable to extrapolate beyond the training data, while ridge\nregression provided extrapolation capability. To enhance model performance,\ndata augmentation techniques were used, including averaging differential\nexpressions for compound mixtures and adjusting cell counts to reduce biases.\n\nIn the end, only the py-boost model is used for generating predictions.\n",
"is_baseline": false,
"paper_reference": null,
"code_url": "https://github.com/Ambros-M/Single-Cell-Perturbations-2023",
"implementation_url": "https://github.com/openproblems-bio/openproblems-v2/tree/a161cfd989c11df9949386a103110fac45734cad//home/runner/work/task-dge-perturbation-prediction/task-dge-perturbation-prediction/src/task/methods/pyboost/config.vsh.yaml",
"code_url": null,
"implementation_url": "https://github.com/openproblems-bio/task_perturbation_prediction/blob/2fa44462b1e7d530bad703c4a20ed22b49d3705e/src/methods/pyboost/config.vsh.yaml",
"code_version": null,
"commit_sha": "a161cfd989c11df9949386a103110fac45734cad"
"commit_sha": "2fa44462b1e7d530bad703c4a20ed22b49d3705e"
}
]
Loading

0 comments on commit 2ed4898

Please sign in to comment.