From d2539c6daf49e238123d0673ab6757cff6260cbc Mon Sep 17 00:00:00 2001 From: Jalil Nourisa Date: Mon, 23 Dec 2024 20:51:12 +0100 Subject: [PATCH] changes to permutations --- runs.ipynb | 1278 ++++------------- src/exp_analysis/helper.py | 2 +- src/helper.py | 151 +- .../single_omics/scgpt/config.vsh.yaml | 16 +- src/metrics/regression_1/main.py | 1 + src/metrics/script_all.py | 36 +- .../peak_annotation/script.R | 4 +- src/robustness_analysis/permute_grn/main.py | 13 + src/robustness_analysis/script_all.py | 67 +- 9 files changed, 402 insertions(+), 1166 deletions(-) diff --git a/runs.ipynb b/runs.ipynb index fcacc7c24..c9ef24d64 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -14,14 +14,32 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# !aws s3 sync resources/ s3://openproblems-data/resources/grn/ --delete\n", "# !aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n", - "# !aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n", - "# !aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete" + "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n", + "# !aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete\n", + "# !aws s3 sync resources/scores/ s3://openproblems-data/resources/grn/scores --delete\n", + "# !aws s3 sync resources/evaluation_datasets/ s3://openproblems-data/resources/grn/evaluation_datasets/ --delete\n", + "# !aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "aws s3 sync s3://openproblems-data/resources/grn/results resources/results/ --delete\n", + "\n", + "aws s3 sync s3://openproblems-data/resources/grn/grn_models resources/grn_models/\n", + "\n", + "aws s3 sync s3://openproblems-data/resources/grn/inference_datasets/ resources/inference_datasets/\n", + "\n", + "aws s3 sync s3://openproblems-data/resources/grn/evaluation_datasets/ resources/evaluation_datasets/" ] }, { @@ -31,7 +49,7 @@ "outputs": [], "source": [ "# !aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n", - "!aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" + "# !aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" ] }, { @@ -79,498 +97,164 @@ "datasets = ['op', 'replogle2', 'nakatake', 'norman', 'adamson']" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Prior " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "if False: \n", - " create_skeleton() # create tf2gene putative links" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Run grn inference " - ] - }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "negative_control\n", - "Job negative_control submitted successfully.\n", - "{'rna': 'resources/inference_datasets/norman_rna.h5ad', 'prediction': 'resources/grn_models/norman//negative_control.csv', 'tf_all': 'resources/prior/tf_all.csv', 'max_n_links': 50000, 'num_workers': '10'}\n", - "Reading input data\n", - "Inferring GRN\n", - "\n" + "hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather\n", + "hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings.feather\n", + "motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl\n" ] } ], "source": [ - "if True: # local runs\n", - " run_grn_inference()" + "!ls output/scenic/databases/" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 3, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['B cells', 'Myeloid cells', 'T cells', 'NK cells']\n", + "Categories (4, object): ['B cells', 'Myeloid cells', 'NK cells', 'T cells']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Calculate scores" + "ad.read_h5ad('resources/inference_datasets/op_rna.h5ad').obs['cell_type'].unique()" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 12, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Submitted batch job 7852664\n" - ] - } - ], + "outputs": [], "source": [ - "from src.helper import calculate_scores\n", - "if False: # consensus: run this after updating grns\n", - " run_consensus(par)\n", - "\n", - "if True: # run metrics/script_all.py\n", - " calculate_scores()" + "# df.gene_name.nunique()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 6, "metadata": {}, + "outputs": [], "source": [ - "# Regression scores" + "# adata = ad.read('resources/inference_datasets/adamson_rna.h5ad')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### OPSCA" + "# Prior " ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50000-skeleton_False-binarize_True-GB.csv\n", - "50000-skeleton_False-binarize_True-ridge.csv\n", - "lognorm-50000-skeleton_False-binarize_True-ridge-global-False.csv\n", - "nets\n", - "X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\n" - ] - } - ], + "outputs": [], "source": [ - "!ls resources/scores/op/" + "if False: \n", + " create_skeleton() # create tf2gene putative links" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### Ridge" + "# Run grn inference " ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 S1S2static-theta-0.0static-theta-0.5static-theta-1.0rank
collectri0.0583150.1268990.2297790.2724090.29076911
negative_control-0.000923-0.0009610.1936400.2606670.29087712
positive_control0.7217501.1888030.6542970.4116750.3101242
pearson_corr0.5703331.0483550.5805830.3798910.3035944
portia0.4655270.6740970.5138850.3172060.2980957
ppcor0.1966890.2384860.3621900.2897430.29119010
grnboost20.7355400.9230760.5812650.4714910.3299401
scenic0.3192720.4738050.5191030.4065250.3174896
granie0.1475220.1939770.1654000.2114970.28323913
scglue0.1556420.6789080.4992500.2938630.2939398
celloracle0.4661190.7624380.5778430.4124850.3094745
figr0.2312060.5361450.2886750.3012670.2956269
scenicplus0.5885780.7537050.6121030.4672260.3247253
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "if False: # local runs\n", + " run_grn_inference()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calculate scores" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from src.helper import calculate_scores\n", + "if False: # consensus: run this after updating grns\n", + " run_consensus(par)\n", + "\n", + "if False: # run metrics/script_all.py\n", + " calculate_scores()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Regression scores" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### OPSCA" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ridge" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'resources/scores/op/X_norm-50000-skeleton_False-binarize_True-ridge-global_False.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m df_scores \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mresources/scores/op/X_norm-50000-skeleton_False-binarize_True-ridge-global_False.csv\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindex_col\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# df_scores[df_scores<0] = 0\u001b[39;00m\n\u001b[1;32m 3\u001b[0m df_all_n \u001b[38;5;241m=\u001b[39m (df_scores\u001b[38;5;241m-\u001b[39mdf_scores\u001b[38;5;241m.\u001b[39mmin(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m))\u001b[38;5;241m/\u001b[39m(df_scores\u001b[38;5;241m.\u001b[39mmax(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m)\u001b[38;5;241m-\u001b[39mdf_scores\u001b[38;5;241m.\u001b[39mmin(axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m))\n", + "File \u001b[0;32m~/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:912\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 899\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 900\u001b[0m dialect,\n\u001b[1;32m 901\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 908\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 909\u001b[0m )\n\u001b[1;32m 910\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m--> 912\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:577\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 574\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 576\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 577\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 579\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 580\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1407\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1404\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1406\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1407\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1661\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1659\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1660\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1661\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1662\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1663\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1664\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1665\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1666\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1667\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1668\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1669\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1670\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1671\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1672\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/miniconda3/envs/py10/lib/python3.10/site-packages/pandas/io/common.py:859\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 854\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 855\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 856\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 857\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 858\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 859\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 860\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 861\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 862\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 863\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 864\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 865\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 866\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 867\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 868\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'resources/scores/op/X_norm-50000-skeleton_False-binarize_True-ridge-global_False.csv'" + ] } ], "source": [ - "df_scores = pd.read_csv(f\"resources/scores/op/50000-skeleton_False-binarize_True-ridge.csv\", index_col=0)\n", + "df_scores = pd.read_csv(f\"resources/scores/op/X_norm-50000-skeleton_False-binarize_True-ridge-global_False.csv\", index_col=0)\n", "# df_scores[df_scores<0] = 0\n", "df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", "df_scores['rank'] = df_all_n.mean(axis=1).rank(ascending=False).astype(int)\n", @@ -2525,595 +2209,6 @@ "df_scores.style.background_gradient()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Format resourcs used" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "if True: \n", - " # job_ids_dict_hvg = { \n", - " # 'portia': 7744548,\n", - " # 'grnboost2': 7742249,\n", - " # 'scenic': 7742283,\n", - " # 'genie3': 7742285,\n", - " # 'ppcor': 7742364,\n", - " # 'scglue': 7742343,\n", - " # }\n", - "\n", - " job_ids_dict = { \n", - " 'portia': 7751292,\n", - " 'grnboost2': 7747906,\n", - " 'scenic': 7748219,\n", - " 'ppcor': 7748321,\n", - " 'scglue': [7756286, 7756675],\n", - " 'scenicplus': [7761874, 7760439, 7760554],\n", - " 'figr': 7756664,\n", - " 'celloracle': 7761872,\n", - " 'pearson_corr': 7770044,\n", - " 'positive_control': 7770047,\n", - " 'negative_control':7770048 \n", - " }\n", - " \n", - " df_res = process_trace_local(job_ids_dict)\n", - " df_res = df_res[['Elapsed', 'MaxVMSize']]\n", - " granie = pd.DataFrame({'Elapsed': 3643.337/60/60, 'MaxVMSize': 41},index=['granie'])\n", - " df_res = pd.concat([df_res, granie], axis=0)\n", - " df_res.columns = ['Duration (hour)', 'Peak memory (GB)']\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Merge scores with resources" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [], - "source": [ - "# - collect all the scores\n", - "for i, dataset in enumerate(datasets):\n", - " df_scores = pd.read_csv(f\"resources/scores/{dataset}/X_norm-50000-skeleton_False-binarize_True-ridge-global-False.csv\", index_col=0)\n", - " # - normalize scores \n", - " df_scores = df_scores.fillna(0)\n", - " df_scores[df_scores < 0] = 0\n", - " df_scores = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))\n", - " df_scores = df_scores.reset_index().melt(id_vars='index', var_name='metric', value_name='r2score').rename(columns={'index':'model'})\n", - " df_scores['dataset'] = dataset\n", - " if i == 0:\n", - " df_all = df_scores\n", - " else:\n", - " df_all = pd.concat([df_all, df_scores], axis=0)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [], - "source": [ - "df_all = df_all[~(df_all['model'] == 'collectri')]" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/vol/tmp/users/jnourisa/ipykernel_1636782/208795827.py:9: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", - " df_metrics = df_all.groupby(['model','metric']).apply(lambda df: mean_for_metrics(df)).reset_index().pivot(index='model', columns='metric', values='r2score')\n", - "/vol/tmp/users/jnourisa/ipykernel_1636782/208795827.py:19: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n", - " df_datasets = df_all.groupby(['model','dataset']).apply(lambda df: mean_for_datasets(df)).reset_index().pivot(index='model', columns='dataset', values='r2score')\n" - ] - } - ], - "source": [ - "# - mean scores for metrics\n", - "def mean_for_metrics(df):\n", - " metric = df['metric'].values.flatten()[0]\n", - " if metric in ['S1','S2']:\n", - " df = df[df['dataset']=='op']\n", - " else:\n", - " pass \n", - " return df[['r2score']].mean()\n", - "df_metrics = df_all.groupby(['model','metric']).apply(lambda df: mean_for_metrics(df)).reset_index().pivot(index='model', columns='metric', values='r2score')\n", - "\n", - "# - mean scores for datasets\n", - "def mean_for_datasets(df):\n", - " dataset = df['dataset'].values.flatten()[0]\n", - " if dataset != 'op':\n", - " df = df[~df['metric'].isin(['S1','S2'])]\n", - " else:\n", - " pass \n", - " return df[['r2score']].mean()\n", - "df_datasets = df_all.groupby(['model','dataset']).apply(lambda df: mean_for_datasets(df)).reset_index().pivot(index='model', columns='dataset', values='r2score')" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
method_nameS1S2static-theta-0.0static-theta-0.5static-theta-1.0adamsonnakatakenormanopreplogle2overall_scoreDuration (hour)Peak memory (GB)User-friendlyComplexity
3GRNBoost21.0000000.7764750.8264320.9809361.0000001.0000000.7875850.9411550.9254191.0000000.9238007.5105567.37879682
7Positive Control0.9812521.0000000.9583460.8608570.6335130.8326830.8577640.8864580.8653770.7290770.8605330.01500011.601669100
10Scenic+0.8001990.6340040.9136970.9835950.8883490.0000000.0000000.0000000.8439690.0000000.84396911.740556131.34285419
5Pearson corr.0.7753940.8818580.8806610.7921170.5567920.8147960.7294210.8349920.7180030.6924850.7676520.04138923.801899100
0CellOracle0.6337100.6413490.8436190.7730480.5617660.0000000.0000000.0000000.6906990.0000000.6906993.76500041.60116664
6Portia0.6329050.5670380.5236100.3370810.2357000.0000000.5119190.2662770.5274880.5699570.4171982.49111155.68523091
11scGLUE0.2116030.5710860.6828640.3167990.2291250.0000000.0000000.0000000.4022950.0000000.40229511.09750061.67787964
9Scenic0.4335220.3928390.4378520.3504870.3450240.0000000.2891970.2409450.6066710.2453430.37132024.00861135.95430073
1FigR0.3059380.4495420.2521490.3452780.2652390.0000000.0000000.0000000.3236290.0000000.3236296.731667225.20872564
8PPCOR0.2674080.2006100.3600930.2851750.1657670.5660480.0418810.4186850.2683510.0338650.26078813.42583364.13643373
4Negative Control0.0000000.0000000.0731810.1396940.0807630.3525810.0000000.0000000.0820890.0000000.0728310.0038892.216045100
2GRaNIE0.2005630.1631700.0000000.0000000.0000000.0000000.0000000.0000000.0727470.0000000.0727471.01203841.00000064
\n", - "
" - ], - "text/plain": [ - " method_name S1 S2 static-theta-0.0 static-theta-0.5 \\\n", - "3 GRNBoost2 1.000000 0.776475 0.826432 0.980936 \n", - "7 Positive Control 0.981252 1.000000 0.958346 0.860857 \n", - "10 Scenic+ 0.800199 0.634004 0.913697 0.983595 \n", - "5 Pearson corr. 0.775394 0.881858 0.880661 0.792117 \n", - "0 CellOracle 0.633710 0.641349 0.843619 0.773048 \n", - "6 Portia 0.632905 0.567038 0.523610 0.337081 \n", - "11 scGLUE 0.211603 0.571086 0.682864 0.316799 \n", - "9 Scenic 0.433522 0.392839 0.437852 0.350487 \n", - "1 FigR 0.305938 0.449542 0.252149 0.345278 \n", - "8 PPCOR 0.267408 0.200610 0.360093 0.285175 \n", - "4 Negative Control 0.000000 0.000000 0.073181 0.139694 \n", - "2 GRaNIE 0.200563 0.163170 0.000000 0.000000 \n", - "\n", - " static-theta-1.0 adamson nakatake norman op replogle2 \\\n", - "3 1.000000 1.000000 0.787585 0.941155 0.925419 1.000000 \n", - "7 0.633513 0.832683 0.857764 0.886458 0.865377 0.729077 \n", - "10 0.888349 0.000000 0.000000 0.000000 0.843969 0.000000 \n", - "5 0.556792 0.814796 0.729421 0.834992 0.718003 0.692485 \n", - "0 0.561766 0.000000 0.000000 0.000000 0.690699 0.000000 \n", - "6 0.235700 0.000000 0.511919 0.266277 0.527488 0.569957 \n", - "11 0.229125 0.000000 0.000000 0.000000 0.402295 0.000000 \n", - "9 0.345024 0.000000 0.289197 0.240945 0.606671 0.245343 \n", - "1 0.265239 0.000000 0.000000 0.000000 0.323629 0.000000 \n", - "8 0.165767 0.566048 0.041881 0.418685 0.268351 0.033865 \n", - "4 0.080763 0.352581 0.000000 0.000000 0.082089 0.000000 \n", - "2 0.000000 0.000000 0.000000 0.000000 0.072747 0.000000 \n", - "\n", - " overall_score Duration (hour) Peak memory (GB) User-friendly \\\n", - "3 0.923800 7.510556 7.378796 8 \n", - "7 0.860533 0.015000 11.601669 10 \n", - "10 0.843969 11.740556 131.342854 1 \n", - "5 0.767652 0.041389 23.801899 10 \n", - "0 0.690699 3.765000 41.601166 6 \n", - "6 0.417198 2.491111 55.685230 9 \n", - "11 0.402295 11.097500 61.677879 6 \n", - "9 0.371320 24.008611 35.954300 7 \n", - "1 0.323629 6.731667 225.208725 6 \n", - "8 0.260788 13.425833 64.136433 7 \n", - "4 0.072831 0.003889 2.216045 10 \n", - "2 0.072747 1.012038 41.000000 6 \n", - "\n", - " Complexity \n", - "3 2 \n", - "7 0 \n", - "10 9 \n", - "5 0 \n", - "0 4 \n", - "6 1 \n", - "11 4 \n", - "9 3 \n", - "1 4 \n", - "8 3 \n", - "4 0 \n", - "2 4 " - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# - calculate over scores\n", - "df_scores = pd.concat([df_metrics, df_datasets], axis=1)\n", - "# df_scores = df_metrics\n", - "df_scores['overall_score'] = df_scores.mean(axis=1)\n", - "\n", - "# - merge scores with resources \n", - "df_summary = pd.concat([df_scores, df_res], axis=1)\n", - "df_summary = df_summary.fillna(0)\n", - "df_summary.index.name = 'method_name' \n", - "df_summary = df_summary.reset_index()\n", - "\n", - "df_summary = df_summary.sort_values(by='overall_score', ascending=False) \n", - "\n", - "df_summary.method_name = df_summary.method_name.map(surragate_names)\n", - "# - add user complexity \n", - "df_summary['User-friendly'] = df_summary['method_name'].map({\n", - " 'Scenic+': 1, \n", - " 'GRNBoost2': 8, \n", - " 'Positive Control': 10, \n", - " 'Pearson corr.': 10,\n", - " 'CellOracle': 6,\n", - " 'Portia': 9,\n", - " 'scGLUE': 6,\n", - " 'Scenic': 7,\n", - " 'FigR': 6,\n", - " 'PPCOR': 7,\n", - " 'Negative Control': 10,\n", - " 'GRaNIE': 6,\n", - " })\n", - "df_summary['Complexity'] = df_summary['User-friendly'].max() - df_summary['User-friendly']\n", - "df_summary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Summary figure" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "── \u001b[1mAttaching packages\u001b[22m ─────────────────────────────────────── tidyverse 1.3.1 ──\n", - "\u001b[32m✔\u001b[39m \u001b[34mggplot2\u001b[39m 3.5.1 \u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 1.0.2\n", - "\u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.2.1 \u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4\n", - "\u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.3.1 \u001b[32m✔\u001b[39m \u001b[34mstringr\u001b[39m 1.5.1\n", - "\u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.2 \u001b[32m✔\u001b[39m \u001b[34mforcats\u001b[39m 0.5.1\n", - "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────── tidyverse_conflicts() ──\n", - "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n", - "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n", - "\u001b[?25h\u001b[?25h\u001b[?25hWarning message:\n", - "\u001b[1m\u001b[22m`thisfile()` was deprecated in rprojroot 2.0.0.\n", - "\u001b[36mℹ\u001b[39m Please use `whereami::thisfile()` instead. \n", - "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[1m\u001b[22mNew names:\n", - "\u001b[36m•\u001b[39m `` -> `...1`\n", - "\u001b[1mRows: \u001b[22m\u001b[34m12\u001b[39m \u001b[1mColumns: \u001b[22m\u001b[34m22\u001b[39m\n", - "\u001b[36m──\u001b[39m \u001b[1mColumn specification\u001b[22m \u001b[36m────────────────────────────────────────────────────────\u001b[39m\n", - "\u001b[1mDelimiter:\u001b[22m \"\\t\"\n", - "\u001b[31mchr\u001b[39m (1): method_name\n", - "\u001b[32mdbl\u001b[39m (21): ...1, S1, S2, static-theta-0.0, static-theta-0.5, static-theta-1.0...\n", - "\n", - "\u001b[36mℹ\u001b[39m Use `spec()` to retrieve the full column specification for this data.\n", - "\u001b[36mℹ\u001b[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.\n", - "\u001b[?25h\u001b[?25h\u001b[90m# A tibble: 18 × 7\u001b[39m\n", - " id id_color name group geom palette options \n", - " \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \u001b[3m\u001b[90m\u001b[39m\u001b[23m \n", - "\u001b[90m 1\u001b[39m method_name \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mName\u001b[90m\"\u001b[39m meth… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", - "\u001b[90m 2\u001b[39m overall_score overall_score \u001b[90m\"\u001b[39mScore\u001b[90m\"\u001b[39m over… bar overall \u001b[90m\u001b[39m\n", - "\u001b[90m 3\u001b[39m S1 S1 \u001b[90m\"\u001b[39mS1\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n", - "\u001b[90m 4\u001b[39m S2 S2 \u001b[90m\"\u001b[39mS2\u001b[90m\"\u001b[39m metr… funk… metric… \u001b[90m\u001b[39m\n", - "\u001b[90m 5\u001b[39m static-theta-0.0 static-theta-0.0 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", - "\u001b[90m 6\u001b[39m static-theta-0.5 static-theta-0.5 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", - "\u001b[90m 7\u001b[39m static-theta-1.0 static-theta-1.0 \u001b[90m\"\u001b[39mTheta (m… metr… funk… metric… \u001b[90m\u001b[39m\n", - "\u001b[90m 8\u001b[39m op op \u001b[90m\"\u001b[39mOPSCA\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", - "\u001b[90m 9\u001b[39m adamson adamson \u001b[90m\"\u001b[39mAdamson\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", - "\u001b[90m10\u001b[39m nakatake nakatake \u001b[90m\"\u001b[39mNakatake\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", - "\u001b[90m11\u001b[39m norman norman \u001b[90m\"\u001b[39mNorman\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", - "\u001b[90m12\u001b[39m replogle2 replogle2 \u001b[90m\"\u001b[39mReplogle\u001b[90m\"\u001b[39m data… funk… dataset \u001b[90m\u001b[39m\n", - "\u001b[90m13\u001b[39m memory_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mPeak mem… reso… rect resour… \u001b[90m\u001b[39m\n", - "\u001b[90m14\u001b[39m memory_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", - "\u001b[90m15\u001b[39m duration_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mDuration… reso… rect resour… \u001b[90m\u001b[39m\n", - "\u001b[90m16\u001b[39m duration_str \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", - "\u001b[90m17\u001b[39m complexity_log \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39mComplexi… reso… rect resour… \u001b[90m\u001b[39m\n", - "\u001b[90m18\u001b[39m Complexity \u001b[31mNA\u001b[39m \u001b[90m\"\u001b[39m\u001b[90m\"\u001b[39m reso… text \u001b[31mNA\u001b[39m \u001b[90m\u001b[39m\n", - "\u001b[?25h\u001b[?25h\u001b[?25h\u001b[?25h\u001b[36mℹ\u001b[39m Could not find column 'id' in data. Using rownames as 'id'.\n", - "\u001b[36mℹ\u001b[39m Column info did not contain a column called 'legend', generating options based on the 'geom' column.\n", - "\u001b[36mℹ\u001b[39m No row info was provided, assuming all rows in `data` are to be plotted.\n", - "\u001b[36mℹ\u001b[39m Row info did not contain group information, assuming rows are ungrouped.\n", - "\u001b[36mℹ\u001b[39m Palette named 'dataset' was not defined. Assuming palette is numerical. Automatically selected palette 'Blues'.\n", - "\u001b[36mℹ\u001b[39m Some palettes were not used in the column info, adding legends for them.\n", - "\u001b[36mℹ\u001b[39m Legend 1 did not contain color, inferring from the palette.\n", - "\u001b[36mℹ\u001b[39m Legend 2 did not contain color, inferring from the palette.\n", - "\u001b[36mℹ\u001b[39m Legend 6 did not contain a geom, inferring from the column info.\n", - "\u001b[36mℹ\u001b[39m Legend 6 did not contain labels, inferring from the geom.\n", - "\u001b[36mℹ\u001b[39m Legend 6 did not contain size, inferring from the labels.\n", - "\u001b[36mℹ\u001b[39m Legend 6 did not contain color, inferring from the palette.\n", - "\u001b[?25h\u001b[?25h\u001b[?25h" - ] - } - ], - "source": [ - "\n", - "summary_file = \"output/summary.tsv\"\n", - "summary_figure = \"output/summary_figure.pdf\"\n", - "\n", - "df_summary['memory_log'] = np.log(df_summary['Peak memory (GB)']+1)\n", - "df_summary['memory_log'] = np.max(df_summary['memory_log'])-df_summary['memory_log']\n", - "\n", - "df_summary['complexity_log'] = np.log(df_summary['Complexity']+1)\n", - "df_summary['complexity_log'] = np.max(df_summary['complexity_log'])-df_summary['complexity_log']\n", - "\n", - "df_summary[\"duration_log\"] = np.log(df_summary['Duration (hour)']+1)\n", - "df_summary['duration_log'] = np.max(df_summary['duration_log'])-df_summary['duration_log']\n", - "\n", - "df_summary[\"duration_str\"] = df_summary['Duration (hour)'].round(1).astype(str)\n", - "df_summary['memory_str'] = df_summary['Peak memory (GB)'].round(1).astype(str)\n", - "\n", - "df_summary.to_csv(summary_file, sep='\\t')\n", - "\n", - "!Rscript ../grn_benchmark/src/summary_figure.R {summary_file} {summary_figure}" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -4186,13 +3281,122 @@ "outputs": [], "source": [ "from src.helper import analyse_imputation\n", - "analyse_imputation(task_grn_inference_dir='./')\n" + "analyse_imputation(task_grn_inference_dir='./')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Causal versus correlations" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "convert to long dataframe\n", + "TF subsetting\n", + "(456, 25090)\n", + "----cross validate for default----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing groups: 100%|██████████| 5/5 [00:12<00:00, 2.41s/it]\n", + "GRN preprocessing: 25090it [00:00, 25373.48it/s]\n", + "ridge CV: 100%|██████████| 25090/25090 [00:02<00:00, 11765.42it/s]\n", + "GRN preprocessing: 25090it [00:00, 25339.89it/s]\n", + "ridge CV: 100%|██████████| 25090/25090 [01:55<00:00, 217.99it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Static approach (theta=1):\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "GRN preprocessing: 25090it [00:00, 25640.46it/s]\n", + "ridge CV: 100%|██████████| 25090/25090 [06:10<00:00, 67.69it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "convert to long dataframe\n", + "(456, 25090)\n", + "----cross validate for default----\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing groups: 100%|██████████| 5/5 [00:02<00:00, 1.86it/s]\n", + "GRN preprocessing: 25090it [00:11, 2215.38it/s]\n", + "ridge CV: 100%|██████████| 25090/25090 [00:02<00:00, 9352.85it/s] \n", + "GRN preprocessing: 25090it [00:11, 2211.47it/s]\n", + "ridge CV: 100%|██████████| 25090/25090 [02:10<00:00, 191.63it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Static approach (theta=1):\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "GRN preprocessing: 25090it [00:11, 2197.01it/s]\n", + "ridge CV: 21%|██ | 5145/25090 [01:17<05:00, 66.47it/s]" + ] + } + ], + "source": [ + "from src.helper import analyse_causal\n", + "analyse_causal(task_grn_inference_dir='./')\n" ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['ZNF354C', 'KLF12', 'ZNF143', ..., 'ZNF831', 'ZRSR2', 'ZSWIM1'],\n", + " dtype=' 1: - day = int(time[0]) - time = time[1] - else: - day = 0 - time = time[0] - h, m, s = map(int, time.split(':')) - return day*24 + h + m / 60 + s / 3600 - def reformat_data(df_local): - # Remove 'K' and convert to integers - df_local['MaxRSS'] = df_local['MaxRSS'].str.replace('K', '').astype(int) - df_local['MaxVMSize'] = df_local['MaxVMSize'].str.replace('K', '').astype(int) - df_local['Elapsed'] = df_local['Elapsed'].apply(lambda x: (elapsed_to_hours(x))) - - # Convert MaxRSS and MaxVMSize from KB to GB - df_local['MaxRSS'] = df_local['MaxRSS'] / (1024 ** 2) # Convert KB to GB - df_local['MaxVMSize'] = df_local['MaxVMSize'] / (1024 ** 2) # Convert KB to GB - return df_local - for i, (name, job_id) in enumerate(job_ids_dict.items()): - if type(job_id)==list: - - for i_sub, job_id_ in enumerate(job_id): - df_ = get_sacct_data(job_id_) - df_ = reformat_data(df_) - if i_sub == 0: - df = df_ - else: - concat_df = pd.concat([df, df_], axis=0) - df['MaxVMSize'] = concat_df['MaxVMSize'].max() - df['MaxRSS'] = concat_df['MaxRSS'].max() - df['Elapsed'] = concat_df['Elapsed'].sum() - else: - df = get_sacct_data(job_id) - df = reformat_data(df) - df.index = [name] - if i==0: - df_local = df - else: - df_local = pd.concat([df_local, df], axis=0) - - - return df_local + if __name__ == '__main__': @@ -453,4 +493,5 @@ def reformat_data(df_local): # calculate_scores() # analyse_meta_cells(task_grn_inference_dir='./') analyse_imputation(task_grn_inference_dir='./') + # analyse_corr_vs_tfmasked_corr(task_grn_inference_dir='./') diff --git a/src/methods/single_omics/scgpt/config.vsh.yaml b/src/methods/single_omics/scgpt/config.vsh.yaml index e9766818d..d62285bdf 100644 --- a/src/methods/single_omics/scgpt/config.vsh.yaml +++ b/src/methods/single_omics/scgpt/config.vsh.yaml @@ -28,11 +28,23 @@ functionality: path: script.py platforms: + # - type: docker + # image: xueerchen/scgpt:0.1.7 + # setup: + # - type: python + # packages: [ gdown ] - type: docker - image: xueerchen/scgpt:0.1.7 + image: openproblems/base_pytorch_nvidia:1.0.0 + # TODO: Try to find working installation of flash attention (flash-attn<1.0.5) setup: - type: python - packages: [ gdown ] + pypi: + - gdown + - scgpt # Install from PyPI to get dependencies + - type: docker + # Force re-installing from GitHub to get bug fixes + run: pip install --upgrade --no-deps --force-reinstall git+https://github.com/bowang-lab/scGPT.git + - type: native - type: nextflow directives: diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index f1fac91ea..d4517f6e2 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -241,6 +241,7 @@ def set_global_seed(seed): def pivot_grn(net): ''' make net to have gene*tf format''' + net = net.drop_duplicates(subset=['target', 'source']) df_tmp = net.pivot(index='target', columns='source', values='weight') return df_tmp.fillna(0) diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py index f5410e967..8d0eed034 100644 --- a/src/metrics/script_all.py +++ b/src/metrics/script_all.py @@ -8,11 +8,29 @@ def define_par(dataset): par = { + # - run general models + 'run_global_models': False, 'reg_type': 'ridge', - 'models_dir': f"resources/grn_models/{dataset}", - 'scores_dir': f"resources/scores/{dataset}", - - 'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], + # 'models_dir': f"resources/grn_models/{dataset}", + # 'scores_dir': f"resources/scores/{dataset}", + # 'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], + + + 'models_dir': f"../ciim/output/grns/", + 'scores_dir': f"../ciim/output/scores/", + 'models': [ + 'pearson_corr', + 'grnboost2', 'celloracle', 'scenicplus', + 'net_all_celltypes_young_all_batches', + 'net_all_celltypes_young_batch_1', + 'net_all_celltypes_old_all_batches', + 'net_all_celltypes_old_batch_1', + 'net_B cells_all_ages_all_batches', + 'net_T cells_all_ages_all_batches', + 'net_Myeloid cells_all_ages_all_batches', + 'net_NK cells_all_ages_all_batches', + + ], 'global_models': [ 'collectri', @@ -63,16 +81,16 @@ def define_par(dataset): # - run consensus from consensus.script import main as main_consensus -# - run general models -global_models = False + # - run metrics -for dataset in ['op']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' +# for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' +for dataset in ['op']: print('------ ', dataset, '------') par = define_par(dataset) os.makedirs(par['scores_dir'], exist_ok=True) main_consensus(par) - if global_models: + if par['run_global_models']: par['models'] = par['global_models'] par['models_dir'] = par['global_models_dir'] for binarize in [False]: @@ -104,7 +122,7 @@ def define_par(dataset): df_all = score else: df_all = pd.concat([df_all, score]) - df_all.to_csv(f"{par['scores_dir']}/{par['layer']}-{max_n_links}-skeleton_{apply_skeleton}-binarize_{binarize}-{par['reg_type']}-global-{global_models}.csv") + df_all.to_csv(f"{par['scores_dir']}/{par['layer']}-{max_n_links}-skeleton_{apply_skeleton}-binarize_{binarize}-{par['reg_type']}-global-{par['run_global_models']}.csv") print(df_all) i+=1 diff --git a/src/process_data/explanatory_analysis/peak_annotation/script.R b/src/process_data/explanatory_analysis/peak_annotation/script.R index 708c63bad..9410ab3a2 100644 --- a/src/process_data/explanatory_analysis/peak_annotation/script.R +++ b/src/process_data/explanatory_analysis/peak_annotation/script.R @@ -11,8 +11,8 @@ library(tibble) ## VIASH START par <- list( - multiomics_atac = "resources/grn-benchmark/multiomics_atac.h5ad", - annot_peak_database = "resources/grn-benchmark/supp/annot_peak_database.csv" + multiomics_atac = "resources/inference_datasets/op_atac.h5ad", + annot_peak_database = "resources/prior/peak_annotation.csv" ) ## VIASH END diff --git a/src/robustness_analysis/permute_grn/main.py b/src/robustness_analysis/permute_grn/main.py index 0206c87cd..e453db5d8 100644 --- a/src/robustness_analysis/permute_grn/main.py +++ b/src/robustness_analysis/permute_grn/main.py @@ -60,6 +60,19 @@ def main(par): random_indices = np.random.choice(prediction.index, size=num_to_modify, replace=False) # 3. Change the sign of the selected rows prediction.loc[random_indices, 'weight'] *= -1 + elif type == 'direction': # change the regulatory sign + # Calculate the number of rows to permute + prediction = prediction.reset_index(drop=True) + n_rows_to_permute = int(len(prediction) * (degree)) + # print(n_rows_to_permute) + + # Randomly select indices to permute + indices_to_permute = np.random.choice(prediction.index, size=n_rows_to_permute, replace=False) + + print(indices_to_permute) + # Swap source and target for the selected rows + prediction.loc[indices_to_permute, ['source', 'target']] = prediction.loc[indices_to_permute, ['target', 'source']].values + elif type == 'binary': # change the regulatory sign prediction['weight'] = np.where(prediction['weight'] > 0, 1, -1) else: diff --git a/src/robustness_analysis/script_all.py b/src/robustness_analysis/script_all.py index dd2c49188..410344ffc 100644 --- a/src/robustness_analysis/script_all.py +++ b/src/robustness_analysis/script_all.py @@ -10,15 +10,17 @@ 'read_dir': "resources/grn_models/op/", 'write_dir': "resources/results/robustness_analysis", 'degrees': [0, 10, 20, 50, 100], + # 'degrees': [20, 50, 100], # 'degrees': [50], - # 'noise_types': ["net", "sign", 'weight'], - 'noise_types': ['weight'], + # 'analysis_types': ["net", "sign", 'weight'], + 'analysis_types': ['direction'], 'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], + # 'methods': ['pearson_corr'], "evaluation_data": "resources/evaluation_datasets/op_perturbation.h5ad", "tf_all": "resources/prior/tf_all.csv", "max_n_links": 50000, - "apply_tf": True, + "apply_tf": False, 'binarize': False, 'subsample': -1, 'verbose': 0, @@ -50,7 +52,7 @@ def run_reg(par): #------ noise types and degrees ------# if True: - for noise_type in par['noise_types']: # run for each noise type (net, sign, weight) + for noise_type in par['analysis_types']: # run for each noise type (net, sign, weight) for degree in par['degrees']: # run for each degree for i, method in enumerate(par['methods']): # run for each method par['prediction'] = f"{par['read_dir']}/{method}.csv" @@ -71,60 +73,5 @@ def run_reg(par): df_all = score else: df_all = pd.concat([df_all, score]) + print(noise_type, degree, df_all) df_all.to_csv(f"{par['write_dir']}/{noise_type}-{degree}-scores.csv") - print(df_all) - -#------ causal vs corr ------# -if False: - from util import create_corr_net - par = { - 'reg_type': 'ridge', - 'write_dir': "resources/results/robustness_analysis", - ## base corr - "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", - 'cell_type_specific': False, - 'normalize': False, - ## metric - 'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad', - "tf_all": "resources/prior/tf_all.csv", - "max_n_links": 50000, - "apply_tf": False, #this has to be false - 'subsample': -2, - 'verbose': 2, - 'binarize': True, - 'num_workers': 20, - 'consensus': 'resources/prior/consensus-num-regulators.json', - 'static_only': True, - 'clip_scores': True, - 'layer': 'scgen_pearson', - 'seed': 32 - } - - # run for corr - os.makedirs(f"{par['write_dir']}/corr/", exist_ok=True) - par['causal'] = False - for i in range(100): - par['causal'] - par['prediction'] = f"{par['write_dir']}/corr/corr.csv" - par['seed'] = i - random.seed(par['seed']) - print('seed :', par['seed']) - - net = create_corr_net(par) - net.to_csv(par['prediction']) - score = run_reg(par) - if i == 0: - scores_corr = score - else: - scores_corr = pd.concat([score, scores_corr], axis=0) - print(scores_corr) - scores_corr.to_csv(f"{par['write_dir']}/corr/scores_corr.csv") - - # run for causal corr - par['prediction'] = f"{par['write_dir']}/corr/corr_causal.csv" - par['causal'] = True - net = create_corr_net(par) - - net.to_csv(par['prediction']) - score = run_reg(par) - score.to_csv(f"{par['write_dir']}/corr/scores_causal.csv") \ No newline at end of file