diff --git a/batch_run.sh b/batch_run.sh index 2f21c876d..1b2cc8ecb 100644 --- a/batch_run.sh +++ b/batch_run.sh @@ -6,7 +6,10 @@ # sbatch --job-name=genie3_donor0_hvg scripts/sbatch/single_omics.sh scenic genie3 # sbatch --job-name=ppcor_donor0_hvg scripts/sbatch/single_omics_R.sh ppcor ppcor -sbatch --job-name=tigress_donor0_hvg scripts/sbatch/single_omics_R.sh tigress tigress # sbatch --job-name=scglue_donor0_hvg scripts/sbatch/scglue.sh scglue scglue + + + +# sbatch --job-name=tigress_donor0_hvg scripts/sbatch/single_omics_R.sh tigress tigress diff --git a/runs.ipynb b/runs.ipynb index 9a27f1c23..26833540d 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -5,6 +5,38 @@ "metadata": {}, "source": [] }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "# collectRI = pd.read_csv(\"https://github.com/pablormier/omnipath-static/raw/main/op/collectri-26.09.2023.zip\")\n", + "# collectRI.to_csv(f'resources/grn_models/d0_hvgs/collectri.csv')\n", + "# collectRI.to_csv(f'resources/grn_models/default/collectri.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sync" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark --delete\n", + "!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n", + "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n", + "!aws s3 sync resources/supplementary/ s3://openproblems-data/resources/grn/supplementary --delete\n", + "!aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -14,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -69,117 +101,6 @@ " return df_all" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Add to somewhere" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# process multiomics data pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# add high coverage and highly variable\n", - "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n", - "def antoine_filtering(adata):\n", - " threshold = 0.1\n", - " mask = adata.X!=0\n", - " mask_obs = (np.sum(mask, axis=1).A.flatten()/mask.shape[1])>threshold\n", - " mask_var = (np.sum(mask, axis=0).A.flatten()/mask.shape[0])>threshold\n", - " adata.obs['high_coverage'] = mask_obs\n", - " adata.var['high_coverage'] = mask_var\n", - "antoine_filtering(adata)\n", - "\n", - "# raw hvgs\n", - "var = sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=7000, inplace=False)\n", - "adata.var['highly_variable'] = var.highly_variable\n", - "adata.write('resources/grn-benchmark/multiomics_rna.h5ad')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# subset to donor 0\n", - "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n", - "adata[adata.obs.donor_id=='donor_0', :].write('resources/grn-benchmark/multiomics_rna_d0.h5ad')" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "# subset to hvgs\n", - "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n", - "adata[:, adata.var.highly_variable].write('resources/grn-benchmark/multiomics_rna_hvg.h5ad')\n", - "\n", - "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n", - "adata[adata.obs.donor_id=='donor_0', adata.var.highly_variable].write('resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## binarize grn" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "\n", - "# Define base directory and subfolder to save binarized files\n", - "base_dir = 'resources/grn_models/d0_hvgs'\n", - "output_dir = os.path.join(base_dir, 'binarized')\n", - "\n", - "# Create the subfolder if it doesn't exist\n", - "os.makedirs(output_dir, exist_ok=True)\n", - "\n", - "# Define the binarization function\n", - "def binarize_weight(weight):\n", - " if weight > 0:\n", - " return 1\n", - " elif weight < 0:\n", - " return -1\n", - " else:\n", - " return 0\n", - "\n", - "# Iterate through all files in the base directory\n", - "for filename in os.listdir(base_dir):\n", - " # Check if the file is a CSV (or the appropriate file extension)\n", - " if filename.endswith('.csv'):\n", - " file_path = os.path.join(base_dir, filename)\n", - " \n", - " # Read the file into a DataFrame\n", - " df = pd.read_csv(file_path, index_col=0)\n", - " \n", - " # Apply binarization to the 'weight' column\n", - " df['weight'] = df['weight'].apply(binarize_weight)\n", - " \n", - " # Save the modified DataFrame to the subfolder\n", - " output_file_path = os.path.join(output_dir, filename)\n", - " df.to_csv(output_file_path, index=False)" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -319,478 +240,263 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Runs\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Baseline experiment" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Baseline scores on donor 0. Normalized, non cell type specific" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "# RUN_ID=\"donor_0_normalized_noncelltype\" \n", - "# models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n", - "# df_all = process_data(RUN_ID, models_all)\n", - "# df_all.style.background_gradient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Baseline scores on donor 0. Not normalized, non cell type specific" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "# RUN_ID=\"donor_0_notnormalized_noncelltype\"\n", - "# models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n", - "# df_all = process_data(RUN_ID, models_all)\n", - "# df_all.style.background_gradient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Baseline scores on donor 0. notnormalized, cell type specific" + "# d0_hvgs" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ - "# RUN_ID=\"donor_0_notnormalized_celltype\"\n", - "# models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n", - "# df_all = process_data(RUN_ID, models_all)\n", - "# df_all.style.background_gradient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Baseline scores on donor 0. Normalized, non cell type specific, hvgs" + "methods = [ 'pearson_corr', 'pearson_causal', 'positive_control', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/state.yaml to resources/results/donor_0_normalized_noncelltype_hvg/state.yaml\n", - "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/trace.txt to resources/results/donor_0_normalized_noncelltype_hvg/trace.txt\n", - "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/scores.yaml to resources/results/donor_0_normalized_noncelltype_hvg/scores.yaml\n", - "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_causal.pearson_causal.prediction.csv to resources/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_causal.pearson_causal.prediction.csv\n", - "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_corr.pearson_corr.prediction.csv to resources/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_corr.pearson_corr.prediction.csv\n", - "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/ridge.positive_control.positive_control.prediction.csv to resources/results/donor_0_normalized_noncelltype_hvg/ridge.positive_control.positive_control.prediction.csv\n" + "download: s3://openproblems-data/resources/grn/results/d0_hvgs/scores.yaml to resources/results/d0_hvgs/scores.yaml\n", + "download: s3://openproblems-data/resources/grn/results/d0_hvgs/trace.txt to resources/results/d0_hvgs/trace.txt\n", + "download: s3://openproblems-data/resources/grn/results/d0_hvgs/metric_configs.yaml to resources/results/d0_hvgs/metric_configs.yaml\n", + "download: s3://openproblems-data/resources/grn/results/d0_hvgs/state.yaml to resources/results/d0_hvgs/state.yaml\n" ] }, { "data": { "text/html": [ "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5
pearson_corr0.2396200.5182170.5295020.524232
pearson_causal0.3646560.5924570.7413280.560490
positive_control0.4925630.6815680.6554070.574608
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "RUN_ID=\"donor_0_normalized_noncelltype_hvg\" \n", - "models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n", - "df_all = process_data(RUN_ID, models_all)\n", - "df_all.style.background_gradient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## individual runs" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
 ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5
celloracle0.2547190.3586230.6395560.580147
\n" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "RUN_ID=\"celloracle_d0_hvg\" \n", - "models_all = ['celloracle']\n", - "df_all = process_data(RUN_ID, models_all=None)\n", - "df_all.style.background_gradient()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Donor 1, HVGs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "methods = [ 'pearson_corr', 'pearson_causal', 'positive_control', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5ex(False)_tf(-1)ex(True)_tf(-1)static-theta-0.0static-theta-0.5
pearson_corr0.2396200.5182170.5295020.524232pearson_corr0.2386640.5146120.5295020.524232
pearson_causal0.3646560.5924570.7413280.560490pearson_causal0.3552560.5787530.7413280.560490
positive_control0.4925630.6815680.6554070.574608positive_control0.4891470.6771550.6554070.574608
portia0.0100700.0125560.4512560.518048portia0.1489410.2272480.4512560.518048
ppcor0.0090180.0069490.3966800.509874ppcor0.0228460.0941070.3966800.509874
genie30.1627380.2042690.7540730.576580genie30.3726410.4903570.7540730.576580
grnboost20.1283550.1704970.7818520.609075grnboost20.3810320.4598600.7818520.609075
scenic0.1377830.1490620.6008390.574294scenic0.1543990.2065710.6008390.574294
scglue0.0670170.2001350.4486170.527076scglue0.0783090.2388590.4486170.527076
celloracle0.2547190.3586230.6395560.580147celloracle0.2168970.3114510.6395560.580147
\n" ], "text/plain": [ - "" + "" ] }, - "execution_count": 11, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -805,37 +511,397 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Sync" + "# Format resourcs used" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 45, "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "upload: resources/grn_models/d0_hvgs/binarized/scglue.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/scglue.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/genie3.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/genie3.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/scenic.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/scenic.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/celloracle.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/celloracle.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/portia.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/portia.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/pearson_causal.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/pearson_causal.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/positive_control.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/positive_control.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/grnboost2.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/grnboost2.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/pearson_corr.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/pearson_corr.csv\n", - "upload: resources/grn_models/d0_hvgs/binarized/ppcor.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/ppcor.csv\n" + "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", + "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", + "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", + "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", + "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", + "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n" ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
JobIDJobNameAllocCPUSElapsedStateMaxRSSMaxVMSize
portia7742241.bat+batch200.110556COMPLETED46.94349747.474384
grnboost27742249.bat+batch201.568056COMPLETED3.0674713.563801
scenic7742283.bat+batch201.908056COMPLETED30.35646132.573463
genie37742285.bat+batch2016.682500COMPLETED13.10510313.563530
ppcor7742364.bat+batch200.556667COMPLETED3.9091194.283978
scglue7742343.bat+batch204.380278FAILED29.91742335.933720
\n", + "
" + ], + "text/plain": [ + " JobID JobName AllocCPUS Elapsed State MaxRSS \\\n", + "portia 7742241.bat+ batch 20 0.110556 COMPLETED 46.943497 \n", + "grnboost2 7742249.bat+ batch 20 1.568056 COMPLETED 3.067471 \n", + "scenic 7742283.bat+ batch 20 1.908056 COMPLETED 30.356461 \n", + "genie3 7742285.bat+ batch 20 16.682500 COMPLETED 13.105103 \n", + "ppcor 7742364.bat+ batch 20 0.556667 COMPLETED 3.909119 \n", + "scglue 7742343.bat+ batch 20 4.380278 FAILED 29.917423 \n", + "\n", + " MaxVMSize \n", + "portia 47.474384 \n", + "grnboost2 3.563801 \n", + "scenic 32.573463 \n", + "genie3 13.563530 \n", + "ppcor 4.283978 \n", + "scglue 35.933720 " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark --delete\n", - "!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n", - "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n", - "!aws s3 sync resources/supplementary/ s3://openproblems-data/resources/grn/supplementary --delete\n", - "!aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete" + "# extract trace for local jobs\n", + "job_ids = {\n", + " 'portia': 7742241,\n", + " 'grnboost2': 7742249,\n", + " 'scenic': 7742283,\n", + " 'genie3': 7742285,\n", + " 'ppcor': 7742364,\n", + " 'scglue': 7742343,\n", + "}\n", + "import subprocess\n", + "import pandas as pd\n", + "import io\n", + "\n", + "def get_sacct_data(job_id):\n", + " command = f'sacct -j {job_id} --format=JobID,JobName,AllocCPUS,Elapsed,State,MaxRSS,MaxVMSize'\n", + " output = subprocess.check_output(command, shell=True).decode()\n", + " \n", + " # Load the output into a DataFrame\n", + " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n", + " df = df.iloc[[2]]\n", + " return df\n", + "\n", + "for i, (name, job_id) in enumerate(job_ids.items()):\n", + " df = get_sacct_data(job_id)\n", + " df.index = [name]\n", + " if i==0:\n", + " df_local = df\n", + " else:\n", + " df_local = pd.concat([df_local, df], axis=0)\n", + "def elapsed_to_hours(elapsed_str):\n", + " h, m, s = map(int, elapsed_str.split(':'))\n", + " return h + m / 60 + s / 3600\n", + "# Remove 'K' and convert to integers\n", + "df_local['MaxRSS'] = df_local['MaxRSS'].str.replace('K', '').astype(int)\n", + "df_local['MaxVMSize'] = df_local['MaxVMSize'].str.replace('K', '').astype(int)\n", + "df_local['Elapsed'] = df_local['Elapsed'].apply(lambda x: (elapsed_to_hours(x)))\n", + "\n", + "\n", + "# Convert MaxRSS and MaxVMSize from KB to GB\n", + "df_local['MaxRSS'] = df_local['MaxRSS'] / (1024 ** 2) # Convert KB to GB\n", + "df_local['MaxVMSize'] = df_local['MaxVMSize'] / (1024 ** 2) # Convert KB to GB\n", + "df_local" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
%cpupeak_rssrcharwcharduration
celloracle799.614.918.086.11.472222
\n", + "
" + ], + "text/plain": [ + " %cpu peak_rss rchar wchar duration\n", + "celloracle 799.6 14.9 18.0 86.1 1.472222" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "# sequra runs\n", + "base_dir = 'resources/results/d0_hvgs_res/'\n", + "models = ['celloracle']\n", + "\n", + "def convert_duration_to_hours(duration_str):\n", + " import re\n", + " hours, minutes, seconds = 0, 0, 0\n", + " time_parts = re.findall(r'(\\d+)([hms])', duration_str)\n", + " for value, unit in time_parts:\n", + " if unit == 'h':\n", + " hours = int(value)\n", + " elif unit == 'm':\n", + " minutes = int(value)\n", + " elif unit == 's':\n", + " seconds = int(value)\n", + " return (hours * 3600 + minutes * 60 + seconds)/3600\n", + "def format_gb(gb_str):\n", + " gb_value = float(gb_str.split()[0])\n", + " return gb_value \n", + "\n", + "\n", + "for i, method in enumerate(models):\n", + " df = pd.read_csv(f'{base_dir}/{method}.txt', sep='\\t')[cols]\n", + " df = df.drop(1)\n", + "\n", + "\n", + " for col in cols:\n", + " if col=='%cpu':\n", + " df[col] = df[col].str.replace('%', '').astype(float)\n", + " elif col=='duration':\n", + " df[col] = df[col].apply(convert_duration_to_hours)\n", + " else:\n", + " df[col] = df[col].apply(format_gb)\n", + " df = df.sort_values(by='duration', ascending=False).iloc[[0]]\n", + " \n", + " df.index = [method]\n", + " if i==0:\n", + " df_sequera = df \n", + " else:\n", + " df_sequera = pd.concat([df_sequera, df], axis=0)\n", + "df_sequera" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Peak memoryDuration
celloracle14.9000001.472222
portia46.9434970.110556
grnboost23.0674711.568056
scenic30.3564611.908056
genie313.10510316.682500
ppcor3.9091190.556667
scglue29.9174234.380278
\n", + "
" + ], + "text/plain": [ + " Peak memory Duration\n", + "celloracle 14.900000 1.472222\n", + "portia 46.943497 0.110556\n", + "grnboost2 3.067471 1.568056\n", + "scenic 30.356461 1.908056\n", + "genie3 13.105103 16.682500\n", + "ppcor 3.909119 0.556667\n", + "scglue 29.917423 4.380278" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# merge local and cluster dfs \n", + "map_dict = {'peak_rss':'MaxRSS', 'duration':'Elapsed'}\n", + "\n", + "df_local = df_local[map_dict.values()]\n", + "df_sequera = df_sequera[map_dict.keys()]\n", + "\n", + "df_sequera.columns = df_sequera.columns.map(map_dict)\n", + "\n", + "\n", + "df_res = pd.concat([df_sequera, df_local], axis=0)\n", + "\n", + "df_res.columns = ['Peak memory (GB)', 'Duration (hour)']\n", + "\n", + "df_res" ] }, { diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh index 20c060b21..878afe48c 100644 --- a/scripts/run_grn_evaluation.sh +++ b/scripts/run_grn_evaluation.sh @@ -1,13 +1,11 @@ #!/bin/bash -# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)" -# viash ns build --parallel - -RUN_ID="d0_hvgs_binarized" +RUN_ID="d0_hvgs" +echo $RUN_ID resources_dir="s3://openproblems-data/resources/grn" # resources_dir="./resources" publish_dir="${resources_dir}/results/${RUN_ID}" -grn_models_folder="${resources_dir}/grn_models/d0_hvgs/binarized" +grn_models_folder="${resources_dir}/grn_models/d0_hvgs" reg_type="ridge" subsample=-2 @@ -41,11 +39,13 @@ grn_names=( "ppcor" "scenic" "portia" - + + "negative_control" "positive_control" "pearson_causal" "pearson_corr" - + + "collectri" ) # Start writing to the YAML file diff --git a/src/control_methods/negative_control/script.py b/src/control_methods/negative_control/script.py index fce2276d6..2c62a4f5f 100644 --- a/src/control_methods/negative_control/script.py +++ b/src/control_methods/negative_control/script.py @@ -6,16 +6,17 @@ ## VIASH START par = { "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", - "prediction": "output/negative_control.csv", - "tf_all": "resources/prior/tf_all.csv" + "prediction": "resources/grn_models/default/negative_control.csv", + "tf_all": "resources/prior/tf_all.csv", + "max_n_links": 50000 } ## VIASH END print(par) def process_links(net, par): net = net[net.source!=net.target] - net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index) - net = net_sorted.head(par['max_n_links']).reset_index(drop=True) + net = net.sample(par['max_n_links']) + print(net) return net print('Reading input data') @@ -23,7 +24,7 @@ def process_links(net, par): gene_names = perturbation_data.var_names.to_numpy() tf_all = np.loadtxt(par['tf_all'], dtype=str) -n_tf = 1200 +n_tf = 500 tfs = tf_all[:n_tf] def create_negative_control(gene_names) -> np.ndarray: diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py index 97ef66ae6..6a1da25d4 100644 --- a/src/metrics/regression_1/main.py +++ b/src/metrics/regression_1/main.py @@ -148,7 +148,8 @@ def regression_1( if (len(net_sub)>par['max_n_links']) and (par['max_n_links']!=-1): net_sub = process_links(net_sub, par) #only top links are considered verbose_print(par['verbose'], f"Number of links reduced to {par['max_n_links']}", 2) - + if par['binarize']: + net['weight'] = net['weight'].apply(binarize_weight) prturb_adata_sub = prturb_adata[prturb_adata.obs.cell_type==cell_type,:] y_true_sub, y_pred_sub = cross_validation(net_sub, prturb_adata_sub, par) @@ -210,8 +211,7 @@ def main(par): net = pd.read_csv(par['prediction']) # net['weight'] = net.weight.abs() # subset to keep only those links with source as tf - if par['binarize']: - net['weight'] = net['weight'].apply(binarize_weight) + if par['apply_tf']: net = net[net.source.isin(tf_all)] # if 'cell_type' in net.columns: diff --git a/src/process_data/multiomics/format_data/config.vsh.yaml b/src/process_data/multiomics/format_data/config.vsh.yaml index c07cda267..220dfe537 100644 --- a/src/process_data/multiomics/format_data/config.vsh.yaml +++ b/src/process_data/multiomics/format_data/config.vsh.yaml @@ -10,17 +10,29 @@ functionality: type: file required: false direction: input - default: resources/datasets_raw/multiome_counts.h5ad + example: resources/datasets_raw/multiome_counts.h5ad - name: --multiomics_rna type: file required: false direction: output - default: resources/grn-benchmark/multiomics_rna.h5ad + example: resources/grn-benchmark/multiomics_rna.h5ad + - name: --multiomics_rna_d0 + type: file + required: false + direction: output + example: resources/grn-benchmark/multiomics_rna_d0.h5ad + + - name: --multiomics_rna_d0_hvg + type: file + required: false + direction: output + example: resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad + - name: --multiomics_atac type: file required: false direction: output - default: resources/grn-benchmark/multiomics_atac.h5ad + example: resources/grn-benchmark/multiomics_atac.h5ad resources: - type: python_script path: script.py diff --git a/src/process_data/multiomics/format_data/script.py b/src/process_data/multiomics/format_data/script.py index 98ebe7802..de8a2af35 100644 --- a/src/process_data/multiomics/format_data/script.py +++ b/src/process_data/multiomics/format_data/script.py @@ -1,4 +1,5 @@ import anndata as ad +import scanpy as sc ## VIASH START par = { # 'multiome_counts': 'resources/datasets_raw/multiome_counts.h5ad', @@ -25,7 +26,23 @@ multiomics_rna = multiomics[:,multiomics.var.feature_types=='Gene Expression'] multiomics_rna.var = multiomics_rna.var[['gene_ids', 'interval']] -# ATAC +def high_coverage(adata): + threshold = 0.1 + mask = adata.X!=0 + mask_obs = (np.sum(mask, axis=1).A.flatten()/mask.shape[1])>threshold + mask_var = (np.sum(mask, axis=0).A.flatten()/mask.shape[0])>threshold + adata.obs['high_coverage'] = mask_obs + adata.var['high_coverage'] = mask_var +high_coverage(multiomics_rna) + +# hvgs +var = sc.pp.highly_variable_genes(multiomics_rna, flavor='seurat_v3', n_top_genes=7000, inplace=False) +multiomics_rna.var['highly_variable'] = var.highly_variable + +# subset to donor 0 +multiomics_rna_d0 = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', :] +multiomics_rna_d0_hvg = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', multiomics_rna.var.highly_variable] +#------ ATAC multiomics_atac = multiomics[:,multiomics.var.feature_types=='Peaks'] multiomics_atac.var = multiomics_atac.var[[]] @@ -45,4 +62,6 @@ multiomics_atac.obs['donor_id'] = multiomics_atac.obs['donor_id'].map(donor_map) multiomics_rna.write(par['multiomics_rna']) +multiomics_rna_h0.write(par['multiomics_rna_h0']) +multiomics_rna_h0_hvg.write(par['multiomics_rna_h0_hvg']) multiomics_atac.write(par['multiomics_atac']) \ No newline at end of file