diff --git a/batch_run.sh b/batch_run.sh
index 2f21c876d..1b2cc8ecb 100644
--- a/batch_run.sh
+++ b/batch_run.sh
@@ -6,7 +6,10 @@
# sbatch --job-name=genie3_donor0_hvg scripts/sbatch/single_omics.sh scenic genie3
# sbatch --job-name=ppcor_donor0_hvg scripts/sbatch/single_omics_R.sh ppcor ppcor
-sbatch --job-name=tigress_donor0_hvg scripts/sbatch/single_omics_R.sh tigress tigress
# sbatch --job-name=scglue_donor0_hvg scripts/sbatch/scglue.sh scglue scglue
+
+
+
+# sbatch --job-name=tigress_donor0_hvg scripts/sbatch/single_omics_R.sh tigress tigress
diff --git a/runs.ipynb b/runs.ipynb
index 9a27f1c23..26833540d 100644
--- a/runs.ipynb
+++ b/runs.ipynb
@@ -5,6 +5,38 @@
"metadata": {},
"source": []
},
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# collectRI = pd.read_csv(\"https://github.com/pablormier/omnipath-static/raw/main/op/collectri-26.09.2023.zip\")\n",
+ "# collectRI.to_csv(f'resources/grn_models/d0_hvgs/collectri.csv')\n",
+ "# collectRI.to_csv(f'resources/grn_models/default/collectri.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Sync"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark --delete\n",
+ "!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n",
+ "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n",
+ "!aws s3 sync resources/supplementary/ s3://openproblems-data/resources/grn/supplementary --delete\n",
+ "!aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -14,7 +46,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -69,117 +101,6 @@
" return df_all"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Add to somewhere"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# process multiomics data pipeline"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "# add high coverage and highly variable\n",
- "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n",
- "def antoine_filtering(adata):\n",
- " threshold = 0.1\n",
- " mask = adata.X!=0\n",
- " mask_obs = (np.sum(mask, axis=1).A.flatten()/mask.shape[1])>threshold\n",
- " mask_var = (np.sum(mask, axis=0).A.flatten()/mask.shape[0])>threshold\n",
- " adata.obs['high_coverage'] = mask_obs\n",
- " adata.var['high_coverage'] = mask_var\n",
- "antoine_filtering(adata)\n",
- "\n",
- "# raw hvgs\n",
- "var = sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=7000, inplace=False)\n",
- "adata.var['highly_variable'] = var.highly_variable\n",
- "adata.write('resources/grn-benchmark/multiomics_rna.h5ad')\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "# subset to donor 0\n",
- "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n",
- "adata[adata.obs.donor_id=='donor_0', :].write('resources/grn-benchmark/multiomics_rna_d0.h5ad')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "# subset to hvgs\n",
- "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n",
- "adata[:, adata.var.highly_variable].write('resources/grn-benchmark/multiomics_rna_hvg.h5ad')\n",
- "\n",
- "adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna.h5ad')\n",
- "adata[adata.obs.donor_id=='donor_0', adata.var.highly_variable].write('resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## binarize grn"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "import os\n",
- "import pandas as pd\n",
- "\n",
- "# Define base directory and subfolder to save binarized files\n",
- "base_dir = 'resources/grn_models/d0_hvgs'\n",
- "output_dir = os.path.join(base_dir, 'binarized')\n",
- "\n",
- "# Create the subfolder if it doesn't exist\n",
- "os.makedirs(output_dir, exist_ok=True)\n",
- "\n",
- "# Define the binarization function\n",
- "def binarize_weight(weight):\n",
- " if weight > 0:\n",
- " return 1\n",
- " elif weight < 0:\n",
- " return -1\n",
- " else:\n",
- " return 0\n",
- "\n",
- "# Iterate through all files in the base directory\n",
- "for filename in os.listdir(base_dir):\n",
- " # Check if the file is a CSV (or the appropriate file extension)\n",
- " if filename.endswith('.csv'):\n",
- " file_path = os.path.join(base_dir, filename)\n",
- " \n",
- " # Read the file into a DataFrame\n",
- " df = pd.read_csv(file_path, index_col=0)\n",
- " \n",
- " # Apply binarization to the 'weight' column\n",
- " df['weight'] = df['weight'].apply(binarize_weight)\n",
- " \n",
- " # Save the modified DataFrame to the subfolder\n",
- " output_file_path = os.path.join(output_dir, filename)\n",
- " df.to_csv(output_file_path, index=False)"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -319,478 +240,263 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Runs\n"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Baseline experiment"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Baseline scores on donor 0. Normalized, non cell type specific"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "# RUN_ID=\"donor_0_normalized_noncelltype\" \n",
- "# models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n",
- "# df_all = process_data(RUN_ID, models_all)\n",
- "# df_all.style.background_gradient()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Baseline scores on donor 0. Not normalized, non cell type specific"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "# RUN_ID=\"donor_0_notnormalized_noncelltype\"\n",
- "# models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n",
- "# df_all = process_data(RUN_ID, models_all)\n",
- "# df_all.style.background_gradient()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Baseline scores on donor 0. notnormalized, cell type specific"
+ "# d0_hvgs"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
- "# RUN_ID=\"donor_0_notnormalized_celltype\"\n",
- "# models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n",
- "# df_all = process_data(RUN_ID, models_all)\n",
- "# df_all.style.background_gradient()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Baseline scores on donor 0. Normalized, non cell type specific, hvgs"
+ "methods = [ 'pearson_corr', 'pearson_causal', 'positive_control', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/state.yaml to resources/results/donor_0_normalized_noncelltype_hvg/state.yaml\n",
- "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/trace.txt to resources/results/donor_0_normalized_noncelltype_hvg/trace.txt\n",
- "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/scores.yaml to resources/results/donor_0_normalized_noncelltype_hvg/scores.yaml\n",
- "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_causal.pearson_causal.prediction.csv to resources/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_causal.pearson_causal.prediction.csv\n",
- "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_corr.pearson_corr.prediction.csv to resources/results/donor_0_normalized_noncelltype_hvg/ridge.pearson_corr.pearson_corr.prediction.csv\n",
- "download: s3://openproblems-data/resources/grn/results/donor_0_normalized_noncelltype_hvg/ridge.positive_control.positive_control.prediction.csv to resources/results/donor_0_normalized_noncelltype_hvg/ridge.positive_control.positive_control.prediction.csv\n"
+ "download: s3://openproblems-data/resources/grn/results/d0_hvgs/scores.yaml to resources/results/d0_hvgs/scores.yaml\n",
+ "download: s3://openproblems-data/resources/grn/results/d0_hvgs/trace.txt to resources/results/d0_hvgs/trace.txt\n",
+ "download: s3://openproblems-data/resources/grn/results/d0_hvgs/metric_configs.yaml to resources/results/d0_hvgs/metric_configs.yaml\n",
+ "download: s3://openproblems-data/resources/grn/results/d0_hvgs/state.yaml to resources/results/d0_hvgs/state.yaml\n"
]
},
{
"data": {
"text/html": [
"\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ex(False)_tf(-1) | \n",
- " ex(True)_tf(-1) | \n",
- " static-theta-0.0 | \n",
- " static-theta-0.5 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " pearson_corr | \n",
- " 0.239620 | \n",
- " 0.518217 | \n",
- " 0.529502 | \n",
- " 0.524232 | \n",
- "
\n",
- " \n",
- " pearson_causal | \n",
- " 0.364656 | \n",
- " 0.592457 | \n",
- " 0.741328 | \n",
- " 0.560490 | \n",
- "
\n",
- " \n",
- " positive_control | \n",
- " 0.492563 | \n",
- " 0.681568 | \n",
- " 0.655407 | \n",
- " 0.574608 | \n",
- "
\n",
- " \n",
- "
\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "RUN_ID=\"donor_0_normalized_noncelltype_hvg\" \n",
- "models_all = ['pearson_corr', 'pearson_causal', 'positive_control']\n",
- "df_all = process_data(RUN_ID, models_all)\n",
- "df_all.style.background_gradient()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## individual runs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- " \n",
- " \n",
- " | \n",
- " ex(False)_tf(-1) | \n",
- " ex(True)_tf(-1) | \n",
- " static-theta-0.0 | \n",
- " static-theta-0.5 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " celloracle | \n",
- " 0.254719 | \n",
- " 0.358623 | \n",
- " 0.639556 | \n",
- " 0.580147 | \n",
- "
\n",
- " \n",
- "
\n"
- ],
- "text/plain": [
- ""
- ]
- },
- "execution_count": 4,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "RUN_ID=\"celloracle_d0_hvg\" \n",
- "models_all = ['celloracle']\n",
- "df_all = process_data(RUN_ID, models_all=None)\n",
- "df_all.style.background_gradient()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Donor 1, HVGs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "methods = [ 'pearson_corr', 'pearson_causal', 'positive_control', 'portia', 'ppcor', 'genie3', 'grnboost2', 'scenic', 'scglue', 'celloracle']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
+ "\n",
" \n",
" \n",
" | \n",
- " ex(False)_tf(-1) | \n",
- " ex(True)_tf(-1) | \n",
- " static-theta-0.0 | \n",
- " static-theta-0.5 | \n",
+ " ex(False)_tf(-1) | \n",
+ " ex(True)_tf(-1) | \n",
+ " static-theta-0.0 | \n",
+ " static-theta-0.5 | \n",
"
\n",
" \n",
" \n",
" \n",
- " pearson_corr | \n",
- " 0.239620 | \n",
- " 0.518217 | \n",
- " 0.529502 | \n",
- " 0.524232 | \n",
+ " pearson_corr | \n",
+ " 0.238664 | \n",
+ " 0.514612 | \n",
+ " 0.529502 | \n",
+ " 0.524232 | \n",
"
\n",
" \n",
- " pearson_causal | \n",
- " 0.364656 | \n",
- " 0.592457 | \n",
- " 0.741328 | \n",
- " 0.560490 | \n",
+ " pearson_causal | \n",
+ " 0.355256 | \n",
+ " 0.578753 | \n",
+ " 0.741328 | \n",
+ " 0.560490 | \n",
"
\n",
" \n",
- " positive_control | \n",
- " 0.492563 | \n",
- " 0.681568 | \n",
- " 0.655407 | \n",
- " 0.574608 | \n",
+ " positive_control | \n",
+ " 0.489147 | \n",
+ " 0.677155 | \n",
+ " 0.655407 | \n",
+ " 0.574608 | \n",
"
\n",
" \n",
- " portia | \n",
- " 0.010070 | \n",
- " 0.012556 | \n",
- " 0.451256 | \n",
- " 0.518048 | \n",
+ " portia | \n",
+ " 0.148941 | \n",
+ " 0.227248 | \n",
+ " 0.451256 | \n",
+ " 0.518048 | \n",
"
\n",
" \n",
- " ppcor | \n",
- " 0.009018 | \n",
- " 0.006949 | \n",
- " 0.396680 | \n",
- " 0.509874 | \n",
+ " ppcor | \n",
+ " 0.022846 | \n",
+ " 0.094107 | \n",
+ " 0.396680 | \n",
+ " 0.509874 | \n",
"
\n",
" \n",
- " genie3 | \n",
- " 0.162738 | \n",
- " 0.204269 | \n",
- " 0.754073 | \n",
- " 0.576580 | \n",
+ " genie3 | \n",
+ " 0.372641 | \n",
+ " 0.490357 | \n",
+ " 0.754073 | \n",
+ " 0.576580 | \n",
"
\n",
" \n",
- " grnboost2 | \n",
- " 0.128355 | \n",
- " 0.170497 | \n",
- " 0.781852 | \n",
- " 0.609075 | \n",
+ " grnboost2 | \n",
+ " 0.381032 | \n",
+ " 0.459860 | \n",
+ " 0.781852 | \n",
+ " 0.609075 | \n",
"
\n",
" \n",
- " scenic | \n",
- " 0.137783 | \n",
- " 0.149062 | \n",
- " 0.600839 | \n",
- " 0.574294 | \n",
+ " scenic | \n",
+ " 0.154399 | \n",
+ " 0.206571 | \n",
+ " 0.600839 | \n",
+ " 0.574294 | \n",
"
\n",
" \n",
- " scglue | \n",
- " 0.067017 | \n",
- " 0.200135 | \n",
- " 0.448617 | \n",
- " 0.527076 | \n",
+ " scglue | \n",
+ " 0.078309 | \n",
+ " 0.238859 | \n",
+ " 0.448617 | \n",
+ " 0.527076 | \n",
"
\n",
" \n",
- " celloracle | \n",
- " 0.254719 | \n",
- " 0.358623 | \n",
- " 0.639556 | \n",
- " 0.580147 | \n",
+ " celloracle | \n",
+ " 0.216897 | \n",
+ " 0.311451 | \n",
+ " 0.639556 | \n",
+ " 0.580147 | \n",
"
\n",
" \n",
"
\n"
],
"text/plain": [
- ""
+ ""
]
},
- "execution_count": 11,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
@@ -805,37 +511,397 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Sync"
+ "# Format resourcs used"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 45,
"metadata": {},
"outputs": [
{
- "name": "stdout",
+ "name": "stderr",
"output_type": "stream",
"text": [
- "upload: resources/grn_models/d0_hvgs/binarized/scglue.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/scglue.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/genie3.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/genie3.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/scenic.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/scenic.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/celloracle.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/celloracle.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/portia.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/portia.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/pearson_causal.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/pearson_causal.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/positive_control.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/positive_control.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/grnboost2.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/grnboost2.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/pearson_corr.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/pearson_corr.csv\n",
- "upload: resources/grn_models/d0_hvgs/binarized/ppcor.csv to s3://openproblems-data/resources/grn/grn_models/d0_hvgs/binarized/ppcor.csv\n"
+ "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
+ "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
+ "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
+ "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
+ "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
+ "/vol/tmp/users/jnourisa/ipykernel_2913022/1716007044.py:19: FutureWarning: The 'delim_whitespace' keyword in pd.read_csv is deprecated and will be removed in a future version. Use ``sep='\\s+'`` instead\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n"
]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " JobID | \n",
+ " JobName | \n",
+ " AllocCPUS | \n",
+ " Elapsed | \n",
+ " State | \n",
+ " MaxRSS | \n",
+ " MaxVMSize | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " portia | \n",
+ " 7742241.bat+ | \n",
+ " batch | \n",
+ " 20 | \n",
+ " 0.110556 | \n",
+ " COMPLETED | \n",
+ " 46.943497 | \n",
+ " 47.474384 | \n",
+ "
\n",
+ " \n",
+ " grnboost2 | \n",
+ " 7742249.bat+ | \n",
+ " batch | \n",
+ " 20 | \n",
+ " 1.568056 | \n",
+ " COMPLETED | \n",
+ " 3.067471 | \n",
+ " 3.563801 | \n",
+ "
\n",
+ " \n",
+ " scenic | \n",
+ " 7742283.bat+ | \n",
+ " batch | \n",
+ " 20 | \n",
+ " 1.908056 | \n",
+ " COMPLETED | \n",
+ " 30.356461 | \n",
+ " 32.573463 | \n",
+ "
\n",
+ " \n",
+ " genie3 | \n",
+ " 7742285.bat+ | \n",
+ " batch | \n",
+ " 20 | \n",
+ " 16.682500 | \n",
+ " COMPLETED | \n",
+ " 13.105103 | \n",
+ " 13.563530 | \n",
+ "
\n",
+ " \n",
+ " ppcor | \n",
+ " 7742364.bat+ | \n",
+ " batch | \n",
+ " 20 | \n",
+ " 0.556667 | \n",
+ " COMPLETED | \n",
+ " 3.909119 | \n",
+ " 4.283978 | \n",
+ "
\n",
+ " \n",
+ " scglue | \n",
+ " 7742343.bat+ | \n",
+ " batch | \n",
+ " 20 | \n",
+ " 4.380278 | \n",
+ " FAILED | \n",
+ " 29.917423 | \n",
+ " 35.933720 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " JobID JobName AllocCPUS Elapsed State MaxRSS \\\n",
+ "portia 7742241.bat+ batch 20 0.110556 COMPLETED 46.943497 \n",
+ "grnboost2 7742249.bat+ batch 20 1.568056 COMPLETED 3.067471 \n",
+ "scenic 7742283.bat+ batch 20 1.908056 COMPLETED 30.356461 \n",
+ "genie3 7742285.bat+ batch 20 16.682500 COMPLETED 13.105103 \n",
+ "ppcor 7742364.bat+ batch 20 0.556667 COMPLETED 3.909119 \n",
+ "scglue 7742343.bat+ batch 20 4.380278 FAILED 29.917423 \n",
+ "\n",
+ " MaxVMSize \n",
+ "portia 47.474384 \n",
+ "grnboost2 3.563801 \n",
+ "scenic 32.573463 \n",
+ "genie3 13.563530 \n",
+ "ppcor 4.283978 \n",
+ "scglue 35.933720 "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
}
],
"source": [
- "!aws s3 sync resources/grn-benchmark s3://openproblems-data/resources/grn/grn-benchmark --delete\n",
- "!aws s3 sync resources/grn_models/ s3://openproblems-data/resources/grn/grn_models --delete\n",
- "!aws s3 sync resources/prior/ s3://openproblems-data/resources/grn/prior --delete\n",
- "!aws s3 sync resources/supplementary/ s3://openproblems-data/resources/grn/supplementary --delete\n",
- "!aws s3 sync resources/results/ s3://openproblems-data/resources/grn/results --delete"
+ "# extract trace for local jobs\n",
+ "job_ids = {\n",
+ " 'portia': 7742241,\n",
+ " 'grnboost2': 7742249,\n",
+ " 'scenic': 7742283,\n",
+ " 'genie3': 7742285,\n",
+ " 'ppcor': 7742364,\n",
+ " 'scglue': 7742343,\n",
+ "}\n",
+ "import subprocess\n",
+ "import pandas as pd\n",
+ "import io\n",
+ "\n",
+ "def get_sacct_data(job_id):\n",
+ " command = f'sacct -j {job_id} --format=JobID,JobName,AllocCPUS,Elapsed,State,MaxRSS,MaxVMSize'\n",
+ " output = subprocess.check_output(command, shell=True).decode()\n",
+ " \n",
+ " # Load the output into a DataFrame\n",
+ " df = pd.read_csv(io.StringIO(output), delim_whitespace=True)\n",
+ " df = df.iloc[[2]]\n",
+ " return df\n",
+ "\n",
+ "for i, (name, job_id) in enumerate(job_ids.items()):\n",
+ " df = get_sacct_data(job_id)\n",
+ " df.index = [name]\n",
+ " if i==0:\n",
+ " df_local = df\n",
+ " else:\n",
+ " df_local = pd.concat([df_local, df], axis=0)\n",
+ "def elapsed_to_hours(elapsed_str):\n",
+ " h, m, s = map(int, elapsed_str.split(':'))\n",
+ " return h + m / 60 + s / 3600\n",
+ "# Remove 'K' and convert to integers\n",
+ "df_local['MaxRSS'] = df_local['MaxRSS'].str.replace('K', '').astype(int)\n",
+ "df_local['MaxVMSize'] = df_local['MaxVMSize'].str.replace('K', '').astype(int)\n",
+ "df_local['Elapsed'] = df_local['Elapsed'].apply(lambda x: (elapsed_to_hours(x)))\n",
+ "\n",
+ "\n",
+ "# Convert MaxRSS and MaxVMSize from KB to GB\n",
+ "df_local['MaxRSS'] = df_local['MaxRSS'] / (1024 ** 2) # Convert KB to GB\n",
+ "df_local['MaxVMSize'] = df_local['MaxVMSize'] / (1024 ** 2) # Convert KB to GB\n",
+ "df_local"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " %cpu | \n",
+ " peak_rss | \n",
+ " rchar | \n",
+ " wchar | \n",
+ " duration | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " celloracle | \n",
+ " 799.6 | \n",
+ " 14.9 | \n",
+ " 18.0 | \n",
+ " 86.1 | \n",
+ " 1.472222 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " %cpu peak_rss rchar wchar duration\n",
+ "celloracle 799.6 14.9 18.0 86.1 1.472222"
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "# sequra runs\n",
+ "base_dir = 'resources/results/d0_hvgs_res/'\n",
+ "models = ['celloracle']\n",
+ "\n",
+ "def convert_duration_to_hours(duration_str):\n",
+ " import re\n",
+ " hours, minutes, seconds = 0, 0, 0\n",
+ " time_parts = re.findall(r'(\\d+)([hms])', duration_str)\n",
+ " for value, unit in time_parts:\n",
+ " if unit == 'h':\n",
+ " hours = int(value)\n",
+ " elif unit == 'm':\n",
+ " minutes = int(value)\n",
+ " elif unit == 's':\n",
+ " seconds = int(value)\n",
+ " return (hours * 3600 + minutes * 60 + seconds)/3600\n",
+ "def format_gb(gb_str):\n",
+ " gb_value = float(gb_str.split()[0])\n",
+ " return gb_value \n",
+ "\n",
+ "\n",
+ "for i, method in enumerate(models):\n",
+ " df = pd.read_csv(f'{base_dir}/{method}.txt', sep='\\t')[cols]\n",
+ " df = df.drop(1)\n",
+ "\n",
+ "\n",
+ " for col in cols:\n",
+ " if col=='%cpu':\n",
+ " df[col] = df[col].str.replace('%', '').astype(float)\n",
+ " elif col=='duration':\n",
+ " df[col] = df[col].apply(convert_duration_to_hours)\n",
+ " else:\n",
+ " df[col] = df[col].apply(format_gb)\n",
+ " df = df.sort_values(by='duration', ascending=False).iloc[[0]]\n",
+ " \n",
+ " df.index = [method]\n",
+ " if i==0:\n",
+ " df_sequera = df \n",
+ " else:\n",
+ " df_sequera = pd.concat([df_sequera, df], axis=0)\n",
+ "df_sequera"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Peak memory | \n",
+ " Duration | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " celloracle | \n",
+ " 14.900000 | \n",
+ " 1.472222 | \n",
+ "
\n",
+ " \n",
+ " portia | \n",
+ " 46.943497 | \n",
+ " 0.110556 | \n",
+ "
\n",
+ " \n",
+ " grnboost2 | \n",
+ " 3.067471 | \n",
+ " 1.568056 | \n",
+ "
\n",
+ " \n",
+ " scenic | \n",
+ " 30.356461 | \n",
+ " 1.908056 | \n",
+ "
\n",
+ " \n",
+ " genie3 | \n",
+ " 13.105103 | \n",
+ " 16.682500 | \n",
+ "
\n",
+ " \n",
+ " ppcor | \n",
+ " 3.909119 | \n",
+ " 0.556667 | \n",
+ "
\n",
+ " \n",
+ " scglue | \n",
+ " 29.917423 | \n",
+ " 4.380278 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Peak memory Duration\n",
+ "celloracle 14.900000 1.472222\n",
+ "portia 46.943497 0.110556\n",
+ "grnboost2 3.067471 1.568056\n",
+ "scenic 30.356461 1.908056\n",
+ "genie3 13.105103 16.682500\n",
+ "ppcor 3.909119 0.556667\n",
+ "scglue 29.917423 4.380278"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# merge local and cluster dfs \n",
+ "map_dict = {'peak_rss':'MaxRSS', 'duration':'Elapsed'}\n",
+ "\n",
+ "df_local = df_local[map_dict.values()]\n",
+ "df_sequera = df_sequera[map_dict.keys()]\n",
+ "\n",
+ "df_sequera.columns = df_sequera.columns.map(map_dict)\n",
+ "\n",
+ "\n",
+ "df_res = pd.concat([df_sequera, df_local], axis=0)\n",
+ "\n",
+ "df_res.columns = ['Peak memory (GB)', 'Duration (hour)']\n",
+ "\n",
+ "df_res"
]
},
{
diff --git a/scripts/run_grn_evaluation.sh b/scripts/run_grn_evaluation.sh
index 20c060b21..878afe48c 100644
--- a/scripts/run_grn_evaluation.sh
+++ b/scripts/run_grn_evaluation.sh
@@ -1,13 +1,11 @@
#!/bin/bash
-# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"
-# viash ns build --parallel
-
-RUN_ID="d0_hvgs_binarized"
+RUN_ID="d0_hvgs"
+echo $RUN_ID
resources_dir="s3://openproblems-data/resources/grn"
# resources_dir="./resources"
publish_dir="${resources_dir}/results/${RUN_ID}"
-grn_models_folder="${resources_dir}/grn_models/d0_hvgs/binarized"
+grn_models_folder="${resources_dir}/grn_models/d0_hvgs"
reg_type="ridge"
subsample=-2
@@ -41,11 +39,13 @@ grn_names=(
"ppcor"
"scenic"
"portia"
-
+
+ "negative_control"
"positive_control"
"pearson_causal"
"pearson_corr"
-
+
+ "collectri"
)
# Start writing to the YAML file
diff --git a/src/control_methods/negative_control/script.py b/src/control_methods/negative_control/script.py
index fce2276d6..2c62a4f5f 100644
--- a/src/control_methods/negative_control/script.py
+++ b/src/control_methods/negative_control/script.py
@@ -6,16 +6,17 @@
## VIASH START
par = {
"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
- "prediction": "output/negative_control.csv",
- "tf_all": "resources/prior/tf_all.csv"
+ "prediction": "resources/grn_models/default/negative_control.csv",
+ "tf_all": "resources/prior/tf_all.csv",
+ "max_n_links": 50000
}
## VIASH END
print(par)
def process_links(net, par):
net = net[net.source!=net.target]
- net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
- net = net_sorted.head(par['max_n_links']).reset_index(drop=True)
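+    # negative control: keep a uniform random sample of links rather than the top-weighted ones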
+ net = net.sample(par['max_n_links'])
+ print(net)
return net
print('Reading input data')
@@ -23,7 +24,7 @@ def process_links(net, par):
gene_names = perturbation_data.var_names.to_numpy()
tf_all = np.loadtxt(par['tf_all'], dtype=str)
-n_tf = 1200
+n_tf = 500
tfs = tf_all[:n_tf]
def create_negative_control(gene_names) -> np.ndarray:
diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py
index 97ef66ae6..6a1da25d4 100644
--- a/src/metrics/regression_1/main.py
+++ b/src/metrics/regression_1/main.py
@@ -148,7 +148,8 @@ def regression_1(
if (len(net_sub)>par['max_n_links']) and (par['max_n_links']!=-1):
net_sub = process_links(net_sub, par) #only top links are considered
verbose_print(par['verbose'], f"Number of links reduced to {par['max_n_links']}", 2)
-
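+        # binarize edge weights to +1/-1/0 after link filtering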
+ if par['binarize']:
+            net_sub['weight'] = net_sub['weight'].apply(binarize_weight)
prturb_adata_sub = prturb_adata[prturb_adata.obs.cell_type==cell_type,:]
y_true_sub, y_pred_sub = cross_validation(net_sub, prturb_adata_sub, par)
@@ -210,8 +211,7 @@ def main(par):
net = pd.read_csv(par['prediction'])
# net['weight'] = net.weight.abs()
# subset to keep only those links with source as tf
- if par['binarize']:
- net['weight'] = net['weight'].apply(binarize_weight)
+
if par['apply_tf']:
net = net[net.source.isin(tf_all)]
# if 'cell_type' in net.columns:
diff --git a/src/process_data/multiomics/format_data/config.vsh.yaml b/src/process_data/multiomics/format_data/config.vsh.yaml
index c07cda267..220dfe537 100644
--- a/src/process_data/multiomics/format_data/config.vsh.yaml
+++ b/src/process_data/multiomics/format_data/config.vsh.yaml
@@ -10,17 +10,29 @@ functionality:
type: file
required: false
direction: input
- default: resources/datasets_raw/multiome_counts.h5ad
+ example: resources/datasets_raw/multiome_counts.h5ad
- name: --multiomics_rna
type: file
required: false
direction: output
- default: resources/grn-benchmark/multiomics_rna.h5ad
+ example: resources/grn-benchmark/multiomics_rna.h5ad
+ - name: --multiomics_rna_d0
+ type: file
+ required: false
+ direction: output
+ example: resources/grn-benchmark/multiomics_rna_d0.h5ad
+
+ - name: --multiomics_rna_d0_hvg
+ type: file
+ required: false
+ direction: output
+ example: resources/grn-benchmark/multiomics_rna_d0_hvg.h5ad
+
- name: --multiomics_atac
type: file
required: false
direction: output
- default: resources/grn-benchmark/multiomics_atac.h5ad
+ example: resources/grn-benchmark/multiomics_atac.h5ad
resources:
- type: python_script
path: script.py
diff --git a/src/process_data/multiomics/format_data/script.py b/src/process_data/multiomics/format_data/script.py
index 98ebe7802..de8a2af35 100644
--- a/src/process_data/multiomics/format_data/script.py
+++ b/src/process_data/multiomics/format_data/script.py
@@ -1,4 +1,6 @@
 import anndata as ad
+import scanpy as sc
+import numpy as np
## VIASH START
par = {
# 'multiome_counts': 'resources/datasets_raw/multiome_counts.h5ad',
@@ -25,7 +26,23 @@
multiomics_rna = multiomics[:,multiomics.var.feature_types=='Gene Expression']
multiomics_rna.var = multiomics_rna.var[['gene_ids', 'interval']]
-# ATAC
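+# flag cells (obs) and genes (var) whose fraction of non-zero counts exceeds the 10% threshold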
+def high_coverage(adata):
+ threshold = 0.1
+ mask = adata.X!=0
+ mask_obs = (np.sum(mask, axis=1).A.flatten()/mask.shape[1])>threshold
+ mask_var = (np.sum(mask, axis=0).A.flatten()/mask.shape[0])>threshold
+ adata.obs['high_coverage'] = mask_obs
+ adata.var['high_coverage'] = mask_var
+high_coverage(multiomics_rna)
+
+# hvgs
+var = sc.pp.highly_variable_genes(multiomics_rna, flavor='seurat_v3', n_top_genes=7000, inplace=False)
+multiomics_rna.var['highly_variable'] = var.highly_variable
+
+# subset to donor 0
+multiomics_rna_d0 = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', :]
+multiomics_rna_d0_hvg = multiomics_rna[multiomics_rna.obs.donor_id=='donor_0', multiomics_rna.var.highly_variable]
+#------ ATAC
multiomics_atac = multiomics[:,multiomics.var.feature_types=='Peaks']
multiomics_atac.var = multiomics_atac.var[[]]
@@ -45,4 +62,6 @@
multiomics_atac.obs['donor_id'] = multiomics_atac.obs['donor_id'].map(donor_map)
multiomics_rna.write(par['multiomics_rna'])
+multiomics_rna_d0.write(par['multiomics_rna_d0'])
+multiomics_rna_d0_hvg.write(par['multiomics_rna_d0_hvg'])
multiomics_atac.write(par['multiomics_atac'])
\ No newline at end of file