diff --git a/runs.ipynb b/runs.ipynb index 6c68ca39..ea20db9e 100644 --- a/runs.ipynb +++ b/runs.ipynb @@ -26,24 +26,12 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "upload: resources_test/datasets_raw/op_perturbation_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/op_perturbation_sc_counts.h5ad\n", - "upload: resources_test/datasets_raw/op_multiome_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/op_multiome_sc_counts.h5ad\n", - "upload: resources_test/inference_datasets/op_rna.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_rna.h5ad\n", - "upload: resources_test/evaluation_datasets/op_perturbation.h5ad to s3://openproblems-data/resources_test/grn/evaluation_datasets/op_perturbation.h5ad\n", - "upload: resources_test/inference_datasets/op_atac.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_atac.h5ad\n" - ] - } - ], + "outputs": [], "source": [ - "!aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n", - "# !aws s3 sync resources/ s3://openproblems-data/resources/grn/ --delete" + "# !aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n", + "!aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete" ] }, { @@ -172,7 +160,15 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Submitted batch job 7852664\n" + ] + } + ], "source": [ "from src.helper import calculate_scores\n", "if False: # consensus: run this after updating grns\n", @@ -4186,32 +4182,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# repo" + "# Effect of pseudobulking on performance" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# if par['metacell']:\n", - "# print('metacell')\n", - "# def create_meta_cells(df, n_cells=15):\n", - "# meta_x = []\n", - "# for i in range(0, df.shape[0], n_cells):\n", - "# meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n", - "# df = pd.DataFrame(meta_x, columns=df.columns)\n", - "# return df\n", - " \n", - "# adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n", - "# adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n", - "# adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n", - "# df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n", - "# X = df.values\n", - "# var = pd.DataFrame(index=df.columns)\n", - "# obs = df.reset_index()[['cell_type','donor_id']]\n", - "# multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)" + "from src.helper import analyse_meta_cells\n", + "analyse_meta_cells(task_grn_inference_dir='./')" ] } ], diff --git a/scripts/sbatch/run_helper.sh b/scripts/sbatch/run_helper.sh new file mode 100644 index 00000000..49fa4c36 --- /dev/null +++ b/scripts/sbatch/run_helper.sh @@ -0,0 +1,11 @@ +#!/bin/bash +#SBATCH --job-name=helper +#SBATCH --time=10:00:00 +#SBATCH --output=logs/%j.out +#SBATCH --error=logs/%j.err +#SBATCH --mail-type=END +#SBATCH --mail-user=jalil.nourisa@gmail.com +#SBATCH --mem=64G +#SBATCH --cpus-per-task=20 + +python src/helper.py diff --git a/src/helper.py b/src/helper.py index 6135ed6a..776d1631 100644 --- a/src/helper.py +++ b/src/helper.py @@ -35,6 +35,89 @@ def process_seqera_trace(): df_res +def analyse_meta_cells(task_grn_inference_dir): + dataset = 'op' #'nakatake' #op', norman + + + par = { + 'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad', + "evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad", + + 'layer': 'X_norm', + 'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json', + 'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv', + 'apply_tf': True, + 'apply_skeleton': False, + 'verbose': 2, + 'max_n_links': 50_000, + 'temp_dir': 'output/metacells/', + 'subsample': -1, + 'num_workers': 4, + 'binarize': True, + 'reg_type': 'ridge' + } + + os.makedirs(par['temp_dir'], exist_ok=True) + # - imports + sys.path.append(f'{task_grn_inference_dir}/') + sys.path.append(f'{task_grn_inference_dir}/src/utils') + + from src.metrics.regression_1.main import main as main_reg1 + from src.metrics.regression_2.main import main as main_reg2 + + from util import corr_net + + # - read inputs and cluster with differen resolutions + rna = ad.read_h5ad(par['rna']) + rna.X = rna.layers[par['layer']] + + sc.pp.pca(rna) + sc.pp.neighbors(rna) + + for res in range(1, 20, 2): + sc.tl.leiden(rna, resolution=res, key_added=f'leiden_{res}') + rna.write(f"{par['temp_dir']}/rna.h5ad") + # - pseudobulkd and run per res + gene_names = rna.var_names + for i_run, res in enumerate([-1]+list(range(1, 20, 2))): + if res == -1: + X = rna.X + else: + cluster_key = f'leiden_{res}' + expression_data = rna.to_df() # Converts AnnData object to a DataFrame (cells × genes) + + # Add the cluster assignments to the expression DataFrame + expression_data[cluster_key] = rna.obs[cluster_key].values + + # Group by cluster and calculate the mean expression per gene + pseudobulk = expression_data.groupby(cluster_key).mean() + #- create anndata + adata_pseudobulk = ad.AnnData( + X=pseudobulk.values, # Gene expression matrix (clusters × genes) + obs=pd.DataFrame(index=pseudobulk.index), # Observations (clusters) + var=pd.DataFrame(index=pseudobulk.columns) # Variables (genes) + ) + X = adata_pseudobulk.X + # - run corr + net = corr_net(X, gene_names, par) + par['prediction'] = f"{par['temp_dir']}/net_{res}.csv" + net.to_csv(par['prediction']) + + # - regression 1 and 2 + scores_reg1 = main_reg1(par) + + scores_reg2 = main_reg2(par) + + scores = pd.concat([scores_reg1, scores_reg2], axis=1) + scores.index = [res] + + if i_run == 0: + scores_all = scores + else: + scores_all = pd.concat([scores_all, scores], axis=0) + print(scores_all) + scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv") + def check_scores(par): df_scores = pd.read_csv(f"resources/scores/scgen_pearson-ridge.csv", index_col=0) df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0)) @@ -287,5 +370,6 @@ def reformat_data(df_local): if __name__ == '__main__': - run_grn_inference() - # calculate_scores() \ No newline at end of file + # run_grn_inference() + # calculate_scores() + analyse_meta_cells(task_grn_inference_dir='./') diff --git a/src/methods/single_omics/scgpt/script.py b/src/methods/single_omics/scgpt/script.py index 211c34eb..6cea4ac7 100644 --- a/src/methods/single_omics/scgpt/script.py +++ b/src/methods/single_omics/scgpt/script.py @@ -47,7 +47,7 @@ ## VIASH START par = { - 'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad', + 'rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad', 'tf_all': 'resources_test/prior/tf_all.csv', 'prediction': 'output/prediction_scgpt.csv', 'max_n_links': 50000, @@ -168,7 +168,7 @@ def monitor_memory(): print('Process rna-seq file') import scanpy as sc -adata = sc.read(par['multiomics_rna']) +adata = sc.read(par['rna']) adata.X = adata.X.todense() adata.obs["celltype"] = adata.obs["cell_type"].astype("category") adata.obs["str_batch"] = adata.obs["donor_id"].astype(str) diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py index 25f1ff6f..f5410e96 100644 --- a/src/metrics/script_all.py +++ b/src/metrics/script_all.py @@ -5,12 +5,12 @@ import os -def define_par(dataset, global_models=False): +def define_par(dataset): par = { 'reg_type': 'ridge', 'models_dir': f"resources/grn_models/{dataset}", - 'scores_dir': f"output/temp/{dataset}", + 'scores_dir': f"resources/scores/{dataset}", 'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], @@ -45,32 +45,7 @@ def define_par(dataset, global_models=False): 'verbose': 4, 'num_workers': 20 } - if global_models: - import shutil - - temp_grn_dir = 'output/models/' - os.makedirs(temp_grn_dir, exist_ok=True) - grn_file_list = [] - for model in par['global_models']: - grn_file = f"{par['global_models_dir']}/{model}.csv" - grn_file_list.append(grn_file) - - for model in par['models']: - grn_file = f"{par['models_dir']}/{model}.csv" - grn_file_list.append(grn_file) - - par['models'] = par['models'] + par['global_models'] - par['models_dir'] = temp_grn_dir - par['consensus'] = f'{temp_grn_dir}/{dataset}_consensus-num-regulators.json' - for grn_file in grn_file_list: - try: - shutil.copy(grn_file, temp_grn_dir) - print(f"Copied {grn_file} to {temp_grn_dir}") - except FileNotFoundError: - print(f"File not found: {grn_file}") - except Exception as e: - print(f"Error copying {grn_file}: {e}") return par @@ -89,16 +64,18 @@ def define_par(dataset, global_models=False): from consensus.script import main as main_consensus # - run general models -global_models = True +global_models = False # - run metrics for dataset in ['op']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson' print('------ ', dataset, '------') par = define_par(dataset) os.makedirs(par['scores_dir'], exist_ok=True) - par = define_par(dataset, global_models=global_models) main_consensus(par) - for binarize in [True]: + if global_models: + par['models'] = par['global_models'] + par['models_dir'] = par['global_models_dir'] + for binarize in [False]: par['binarize'] = binarize for max_n_links in [50000]: par['max_n_links'] = max_n_links diff --git a/src/robustness_analysis/script_all.py b/src/robustness_analysis/script_all.py index aeecb75e..c9553f00 100644 --- a/src/robustness_analysis/script_all.py +++ b/src/robustness_analysis/script_all.py @@ -7,15 +7,15 @@ par = { 'reg_type': 'ridge', - 'read_dir': "resources/grn_models/", + 'read_dir': "resources/grn_models/op/", 'write_dir': "resources/results/robustness_analysis", - # 'degrees': [0, 10, 20, 50, 100], - 'degrees': [50], + 'degrees': [0, 10, 20, 50, 100], + # 'degrees': [50], 'noise_types': ["net", "sign"], # 'noise_types': ['weight'], - 'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], + 'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'], - "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad", + "evaluation_data": "resources/evaluation_datasets/op_perturbation.h5ad", "tf_all": "resources/prior/tf_all.csv", "max_n_links": 50000, "apply_tf": True, @@ -23,11 +23,10 @@ 'subsample': -1, 'verbose': 0, 'num_workers': 20, - 'consensus': 'resources/prior/consensus-num-regulators.json', + 'consensus': 'resources/prior/op_consensus-num-regulators.json', 'static_only': True, - 'clip_scores': True, - 'layer': 'pearson', - 'apply_skeleton': True, + 'layer': 'X_norm', + 'apply_skeleton': False, 'skeleton': 'resources/prior/skeleton.csv' } diff --git a/src/utils/util.py b/src/utils/util.py index 7dd52760..00eb5f4b 100644 --- a/src/utils/util.py +++ b/src/utils/util.py @@ -69,7 +69,7 @@ def efficient_melting(net, gene_names, par): data = np.column_stack((targets, sources, weights)) # Convert to DataFrame - print('conver to df') + print('convert to long dataframe') net = pd.DataFrame(data, columns=['target', 'source', 'weight']) return net