Skip to content

Commit

Permalink
bug in scgpt fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Nov 29, 2024
1 parent ec8efd8 commit 734712f
Show file tree
Hide file tree
Showing 7 changed files with 132 additions and 80 deletions.
53 changes: 17 additions & 36 deletions runs.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -26,24 +26,12 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"upload: resources_test/datasets_raw/op_perturbation_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/op_perturbation_sc_counts.h5ad\n",
"upload: resources_test/datasets_raw/op_multiome_sc_counts.h5ad to s3://openproblems-data/resources_test/grn/datasets_raw/op_multiome_sc_counts.h5ad\n",
"upload: resources_test/inference_datasets/op_rna.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_rna.h5ad\n",
"upload: resources_test/evaluation_datasets/op_perturbation.h5ad to s3://openproblems-data/resources_test/grn/evaluation_datasets/op_perturbation.h5ad\n",
"upload: resources_test/inference_datasets/op_atac.h5ad to s3://openproblems-data/resources_test/grn/inference_datasets/op_atac.h5ad\n"
]
}
],
"outputs": [],
"source": [
"!aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n",
"# !aws s3 sync resources/ s3://openproblems-data/resources/grn/ --delete"
"# !aws s3 sync resources_test/ s3://openproblems-data/resources_test/grn/ --delete\n",
"!aws s3 sync resources/inference_datasets/ s3://openproblems-data/resources/grn/inference_datasets/ --delete"
]
},
{
Expand Down Expand Up @@ -172,7 +160,15 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Submitted batch job 7852664\n"
]
}
],
"source": [
"from src.helper import calculate_scores\n",
"if False: # consensus: run this after updating grns\n",
Expand Down Expand Up @@ -4186,32 +4182,17 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# repo"
"# Effect of pseudobulking on performance"
]
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# if par['metacell']:\n",
"# print('metacell')\n",
"# def create_meta_cells(df, n_cells=15):\n",
"# meta_x = []\n",
"# for i in range(0, df.shape[0], n_cells):\n",
"# meta_x.append(df.iloc[i:i+n_cells, :].sum(axis=0).values)\n",
"# df = pd.DataFrame(meta_x, columns=df.columns)\n",
"# return df\n",
" \n",
"# adata_df = pd.DataFrame(multiomics_rna.X.todense(), columns=multiomics_rna.var_names)\n",
"# adata_df['cell_type'] = multiomics_rna.obs['cell_type'].values\n",
"# adata_df['donor_id'] = multiomics_rna.obs['donor_id'].values\n",
"# df = adata_df.groupby(['cell_type','donor_id']).apply(lambda df: create_meta_cells(df))\n",
"# X = df.values\n",
"# var = pd.DataFrame(index=df.columns)\n",
"# obs = df.reset_index()[['cell_type','donor_id']]\n",
"# multiomics_rna = ad.AnnData(X=X, obs=obs, var=var)"
"from src.helper import analyse_meta_cells\n",
"analyse_meta_cells(task_grn_inference_dir='./')"
]
}
],
Expand Down
11 changes: 11 additions & 0 deletions scripts/sbatch/run_helper.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#SBATCH --job-name=helper
#SBATCH --time=10:00:00
#SBATCH --output=logs/%j.out
#SBATCH --error=logs/%j.err
#SBATCH --mail-type=END
#SBATCH [email protected]
#SBATCH --mem=64G
#SBATCH --cpus-per-task=20

python src/helper.py
88 changes: 86 additions & 2 deletions src/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,89 @@ def process_seqera_trace():
df_res


def analyse_meta_cells(task_grn_inference_dir):
dataset = 'op' #'nakatake' #op', norman


par = {
'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad',
"evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad",

'layer': 'X_norm',
'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json',
'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv',
'apply_tf': True,
'apply_skeleton': False,
'verbose': 2,
'max_n_links': 50_000,
'temp_dir': 'output/metacells/',
'subsample': -1,
'num_workers': 4,
'binarize': True,
'reg_type': 'ridge'
}

os.makedirs(par['temp_dir'], exist_ok=True)
# - imports
sys.path.append(f'{task_grn_inference_dir}/')
sys.path.append(f'{task_grn_inference_dir}/src/utils')

from src.metrics.regression_1.main import main as main_reg1
from src.metrics.regression_2.main import main as main_reg2

from util import corr_net

# - read inputs and cluster with differen resolutions
rna = ad.read_h5ad(par['rna'])
rna.X = rna.layers[par['layer']]

sc.pp.pca(rna)
sc.pp.neighbors(rna)

for res in range(1, 20, 2):
sc.tl.leiden(rna, resolution=res, key_added=f'leiden_{res}')
rna.write(f"{par['temp_dir']}/rna.h5ad")
# - pseudobulkd and run per res
gene_names = rna.var_names
for i_run, res in enumerate([-1]+list(range(1, 20, 2))):
if res == -1:
X = rna.X
else:
cluster_key = f'leiden_{res}'
expression_data = rna.to_df() # Converts AnnData object to a DataFrame (cells × genes)

# Add the cluster assignments to the expression DataFrame
expression_data[cluster_key] = rna.obs[cluster_key].values

# Group by cluster and calculate the mean expression per gene
pseudobulk = expression_data.groupby(cluster_key).mean()
#- create anndata
adata_pseudobulk = ad.AnnData(
X=pseudobulk.values, # Gene expression matrix (clusters × genes)
obs=pd.DataFrame(index=pseudobulk.index), # Observations (clusters)
var=pd.DataFrame(index=pseudobulk.columns) # Variables (genes)
)
X = adata_pseudobulk.X
# - run corr
net = corr_net(X, gene_names, par)
par['prediction'] = f"{par['temp_dir']}/net_{res}.csv"
net.to_csv(par['prediction'])

# - regression 1 and 2
scores_reg1 = main_reg1(par)

scores_reg2 = main_reg2(par)

scores = pd.concat([scores_reg1, scores_reg2], axis=1)
scores.index = [res]

if i_run == 0:
scores_all = scores
else:
scores_all = pd.concat([scores_all, scores], axis=0)
print(scores_all)
scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv")

def check_scores(par):
df_scores = pd.read_csv(f"resources/scores/scgen_pearson-ridge.csv", index_col=0)
df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))
Expand Down Expand Up @@ -287,5 +370,6 @@ def reformat_data(df_local):


if __name__ == '__main__':
run_grn_inference()
# calculate_scores()
# run_grn_inference()
# calculate_scores()
analyse_meta_cells(task_grn_inference_dir='./')
4 changes: 2 additions & 2 deletions src/methods/single_omics/scgpt/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@

## VIASH START
par = {
'multiomics_rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad',
'rna': 'resources_test/grn-benchmark/multiomics_rna.h5ad',
'tf_all': 'resources_test/prior/tf_all.csv',
'prediction': 'output/prediction_scgpt.csv',
'max_n_links': 50000,
Expand Down Expand Up @@ -168,7 +168,7 @@ def monitor_memory():

print('Process rna-seq file')
import scanpy as sc
adata = sc.read(par['multiomics_rna'])
adata = sc.read(par['rna'])
adata.X = adata.X.todense()
adata.obs["celltype"] = adata.obs["cell_type"].astype("category")
adata.obs["str_batch"] = adata.obs["donor_id"].astype(str)
Expand Down
37 changes: 7 additions & 30 deletions src/metrics/script_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
import os


def define_par(dataset, global_models=False):
def define_par(dataset):

par = {
'reg_type': 'ridge',
'models_dir': f"resources/grn_models/{dataset}",
'scores_dir': f"output/temp/{dataset}",
'scores_dir': f"resources/scores/{dataset}",

'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],

Expand Down Expand Up @@ -45,32 +45,7 @@ def define_par(dataset, global_models=False):
'verbose': 4,
'num_workers': 20
}
if global_models:
import shutil

temp_grn_dir = 'output/models/'
os.makedirs(temp_grn_dir, exist_ok=True)

grn_file_list = []
for model in par['global_models']:
grn_file = f"{par['global_models_dir']}/{model}.csv"
grn_file_list.append(grn_file)

for model in par['models']:
grn_file = f"{par['models_dir']}/{model}.csv"
grn_file_list.append(grn_file)

par['models'] = par['models'] + par['global_models']
par['models_dir'] = temp_grn_dir
par['consensus'] = f'{temp_grn_dir}/{dataset}_consensus-num-regulators.json'
for grn_file in grn_file_list:
try:
shutil.copy(grn_file, temp_grn_dir)
print(f"Copied {grn_file} to {temp_grn_dir}")
except FileNotFoundError:
print(f"File not found: {grn_file}")
except Exception as e:
print(f"Error copying {grn_file}: {e}")
return par


Expand All @@ -89,16 +64,18 @@ def define_par(dataset, global_models=False):
from consensus.script import main as main_consensus

# - run general models
global_models = True
global_models = False

# - run metrics
for dataset in ['op']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson'
print('------ ', dataset, '------')
par = define_par(dataset)
os.makedirs(par['scores_dir'], exist_ok=True)
par = define_par(dataset, global_models=global_models)
main_consensus(par)
for binarize in [True]:
if global_models:
par['models'] = par['global_models']
par['models_dir'] = par['global_models_dir']
for binarize in [False]:
par['binarize'] = binarize
for max_n_links in [50000]:
par['max_n_links'] = max_n_links
Expand Down
17 changes: 8 additions & 9 deletions src/robustness_analysis/script_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,27 +7,26 @@

par = {
'reg_type': 'ridge',
'read_dir': "resources/grn_models/",
'read_dir': "resources/grn_models/op/",
'write_dir': "resources/results/robustness_analysis",
# 'degrees': [0, 10, 20, 50, 100],
'degrees': [50],
'degrees': [0, 10, 20, 50, 100],
# 'degrees': [50],
'noise_types': ["net", "sign"],
# 'noise_types': ['weight'],
'methods': [ 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],
'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],

"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
"evaluation_data": "resources/evaluation_datasets/op_perturbation.h5ad",
"tf_all": "resources/prior/tf_all.csv",
"max_n_links": 50000,
"apply_tf": True,
'binarize': False,
'subsample': -1,
'verbose': 0,
'num_workers': 20,
'consensus': 'resources/prior/consensus-num-regulators.json',
'consensus': 'resources/prior/op_consensus-num-regulators.json',
'static_only': True,
'clip_scores': True,
'layer': 'pearson',
'apply_skeleton': True,
'layer': 'X_norm',
'apply_skeleton': False,
'skeleton': 'resources/prior/skeleton.csv'
}

Expand Down
2 changes: 1 addition & 1 deletion src/utils/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def efficient_melting(net, gene_names, par):
data = np.column_stack((targets, sources, weights))

# Convert to DataFrame
print('conver to df')
print('convert to long dataframe')
net = pd.DataFrame(data, columns=['target', 'source', 'weight'])
return net

Expand Down

0 comments on commit 734712f

Please sign in to comment.