Skip to content

Commit

Permalink
changes to permutations
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Dec 23, 2024
1 parent af362dc commit d2539c6
Show file tree
Hide file tree
Showing 9 changed files with 402 additions and 1,166 deletions.
1,278 changes: 241 additions & 1,037 deletions runs.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/exp_analysis/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def jaccard_similarity_net(nets_dict, col_name='link', figsize=(4, 4), title='ja

return jaccard_matrix, fig

def plot_cumulative_density(data, ax=None, title=None, label=None, s=3, x_label='Weight', **kwdgs):
def plot_cumulative_density(data, ax=None, title=None, label=None, x_label='Weight', **kwdgs):
# Sort the data
sorted_data = np.sort(data)
# Compute the cumulative density values
Expand Down
151 changes: 96 additions & 55 deletions src/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,15 @@ def analyse_imputation(task_grn_inference_dir):
sc.pp.neighbors(rna)

X_original = rna.X.todense().A


# - pseudobulkd and run per res
gene_names = rna.var_names
for i_run, method in enumerate(['knn', 'softimpute', -1]):
for i_run, method in enumerate(['magic', 'knn', -1]):
if method == -1:
X = X_original
X = rna.X.todense().A
elif method == 'knn':
X_original[X_original == 0] = np.nan
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=10)
X = knn_imputer.fit_transform(X_original)
Expand Down Expand Up @@ -197,6 +200,95 @@ def analyse_imputation(task_grn_inference_dir):
scores_all = pd.concat([scores_all, scores], axis=0)
print(scores_all)
scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv")

def analyse_corr_vs_tfmasked_corr(task_grn_inference_dir):
for i_run, dataset in enumerate(['op', 'replogle2', 'nakatake', 'norman', 'adamson']):
par = {
'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad',
"evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad",

'layer': 'X_norm',
'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json',
'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv',
'apply_skeleton': False,
'verbose': 2,
'max_n_links': 50_000,
'temp_dir': 'output/causal_vs_corr/',
'subsample': -1,
'num_workers': 4,
'binarize': True,
'reg_type': 'ridge'
}

os.makedirs(par['temp_dir'], exist_ok=True)
# - imports
sys.path.append(f'{task_grn_inference_dir}/')
sys.path.append(f'{task_grn_inference_dir}/src/utils')

from src.metrics.regression_1.main import main as main_reg1
from src.metrics.regression_2.main import main as main_reg2

from util import efficient_melting, process_links

# - read inputs and cluster with differen resolutions
rna = ad.read_h5ad(par['rna'])
gene_names = rna.var_names.to_numpy()
try:
rna.X = rna.layers[par['layer']]
except:
raise ValueError(f'{dataset} doesnt have X_norm')
X = rna.X
import scipy.sparse
# Assert that rna.X is not a sparse matrix
if scipy.sparse.issparse(X):
X = X.todense().A

def calculate_corr(X, par):
net = np.corrcoef(X.T)

# # Convert to a DataFrame with gene names as both row and column indices
net = pd.DataFrame(net, index=gene_names, columns=gene_names)

net = net.values

net = efficient_melting(net, gene_names, par)
if par['apply_tf']:
tf_all = np.loadtxt(par['tf_all'], dtype=str)
tf_all = np.intersect1d(tf_all, gene_names)
print('TF subsetting')
net = net[net.source.isin(tf_all)]

net = process_links(net, par)
return net
def calculate_scores(par):
scores_reg1 = main_reg1(par)
scores_reg2 = main_reg2(par)
scores = pd.concat([scores_reg1, scores_reg2], axis=1)
return scores
# - causal net
par['apply_tf'] = True
net = calculate_corr(X, par)
par['prediction'] = f"{par['temp_dir']}/net_causal.csv"
net.to_csv(par['prediction'])
scores_causal = calculate_scores(par)
scores_causal['type'] = ['causal']
# - corr net
par['apply_tf'] = False
net = calculate_corr(X, par)
par['prediction'] = f"{par['temp_dir']}/net_corr.csv"
net.to_csv(par['prediction'])
scores_corr = calculate_scores(par)
scores_corr['type']= ['corr']

scores_all = pd.concat([scores_causal, scores_corr], axis=0)

scores_all['dataset'] = dataset
if i_run == 0:
scores_all_datasets = scores_all
else:
scores_all_datasets = pd.concat([scores_all, scores_all_datasets], axis=0)
print(scores_all)
scores_all_datasets.to_csv(f"{par['temp_dir']}/scores_all.csv")
def check_scores(par):
df_scores = pd.read_csv(f"resources/scores/scgen_pearson-ridge.csv", index_col=0)
df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))
Expand Down Expand Up @@ -393,64 +485,13 @@ def format_ram(row):
else:
trace[col] = trace[col].apply(format_ram)
return trace
def process_trace_local(job_ids_dict):
def get_sacct_data(job_id):
command = f'sacct -j {job_id} --format=JobID,JobName,AllocCPUS,Elapsed,State,MaxRSS,MaxVMSize'
output = subprocess.check_output(command, shell=True).decode()

# Load the output into a DataFrame
df = pd.read_csv(io.StringIO(output), delim_whitespace=True)
df = df.iloc[[2]]
return df
def elapsed_to_hours(elapsed_str):
time = elapsed_str.split('-')
if len(time) > 1:
day = int(time[0])
time = time[1]
else:
day = 0
time = time[0]
h, m, s = map(int, time.split(':'))
return day*24 + h + m / 60 + s / 3600
def reformat_data(df_local):
# Remove 'K' and convert to integers
df_local['MaxRSS'] = df_local['MaxRSS'].str.replace('K', '').astype(int)
df_local['MaxVMSize'] = df_local['MaxVMSize'].str.replace('K', '').astype(int)
df_local['Elapsed'] = df_local['Elapsed'].apply(lambda x: (elapsed_to_hours(x)))

# Convert MaxRSS and MaxVMSize from KB to GB
df_local['MaxRSS'] = df_local['MaxRSS'] / (1024 ** 2) # Convert KB to GB
df_local['MaxVMSize'] = df_local['MaxVMSize'] / (1024 ** 2) # Convert KB to GB
return df_local
for i, (name, job_id) in enumerate(job_ids_dict.items()):
if type(job_id)==list:

for i_sub, job_id_ in enumerate(job_id):
df_ = get_sacct_data(job_id_)
df_ = reformat_data(df_)
if i_sub == 0:
df = df_
else:
concat_df = pd.concat([df, df_], axis=0)
df['MaxVMSize'] = concat_df['MaxVMSize'].max()
df['MaxRSS'] = concat_df['MaxRSS'].max()
df['Elapsed'] = concat_df['Elapsed'].sum()
else:
df = get_sacct_data(job_id)
df = reformat_data(df)
df.index = [name]
if i==0:
df_local = df
else:
df_local = pd.concat([df_local, df], axis=0)


return df_local



if __name__ == '__main__':
# run_grn_inference()
# calculate_scores()
# analyse_meta_cells(task_grn_inference_dir='./')
analyse_imputation(task_grn_inference_dir='./')
# analyse_corr_vs_tfmasked_corr(task_grn_inference_dir='./')

16 changes: 14 additions & 2 deletions src/methods/single_omics/scgpt/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,23 @@ functionality:
path: script.py

platforms:
# - type: docker
# image: xueerchen/scgpt:0.1.7
# setup:
# - type: python
# packages: [ gdown ]
- type: docker
image: xueerchen/scgpt:0.1.7
image: openproblems/base_pytorch_nvidia:1.0.0
# TODO: Try to find working installation of flash attention (flash-attn<1.0.5)
setup:
- type: python
packages: [ gdown ]
pypi:
- gdown
- scgpt # Install from PyPI to get dependencies
- type: docker
# Force re-installing from GitHub to get bug fixes
run: pip install --upgrade --no-deps --force-reinstall git+https://github.com/bowang-lab/scGPT.git

- type: native
- type: nextflow
directives:
Expand Down
1 change: 1 addition & 0 deletions src/metrics/regression_1/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ def set_global_seed(seed):

def pivot_grn(net):
''' make net to have gene*tf format'''
net = net.drop_duplicates(subset=['target', 'source'])
df_tmp = net.pivot(index='target', columns='source', values='weight')
return df_tmp.fillna(0)

Expand Down
36 changes: 27 additions & 9 deletions src/metrics/script_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,29 @@
def define_par(dataset):

par = {
# - run general models
'run_global_models': False,
'reg_type': 'ridge',
'models_dir': f"resources/grn_models/{dataset}",
'scores_dir': f"resources/scores/{dataset}",

'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],
# 'models_dir': f"resources/grn_models/{dataset}",
# 'scores_dir': f"resources/scores/{dataset}",
# 'models': [ 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],


'models_dir': f"../ciim/output/grns/",
'scores_dir': f"../ciim/output/scores/",
'models': [
'pearson_corr',
'grnboost2', 'celloracle', 'scenicplus',
'net_all_celltypes_young_all_batches',
'net_all_celltypes_young_batch_1',
'net_all_celltypes_old_all_batches',
'net_all_celltypes_old_batch_1',
'net_B cells_all_ages_all_batches',
'net_T cells_all_ages_all_batches',
'net_Myeloid cells_all_ages_all_batches',
'net_NK cells_all_ages_all_batches',

],

'global_models': [
'collectri',
Expand Down Expand Up @@ -63,16 +81,16 @@ def define_par(dataset):
# - run consensus
from consensus.script import main as main_consensus

# - run general models
global_models = False


# - run metrics
for dataset in ['op']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson'
# for dataset in ['op', 'replogle2', 'nakatake', 'norman', 'adamson']: #'op', 'replogle2', 'nakatake', 'norman', 'adamson'
for dataset in ['op']:
print('------ ', dataset, '------')
par = define_par(dataset)
os.makedirs(par['scores_dir'], exist_ok=True)
main_consensus(par)
if global_models:
if par['run_global_models']:
par['models'] = par['global_models']
par['models_dir'] = par['global_models_dir']
for binarize in [False]:
Expand Down Expand Up @@ -104,7 +122,7 @@ def define_par(dataset):
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{par['scores_dir']}/{par['layer']}-{max_n_links}-skeleton_{apply_skeleton}-binarize_{binarize}-{par['reg_type']}-global-{global_models}.csv")
df_all.to_csv(f"{par['scores_dir']}/{par['layer']}-{max_n_links}-skeleton_{apply_skeleton}-binarize_{binarize}-{par['reg_type']}-global-{par['run_global_models']}.csv")
print(df_all)
i+=1

Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ library(tibble)

## VIASH START
par <- list(
multiomics_atac = "resources/grn-benchmark/multiomics_atac.h5ad",
annot_peak_database = "resources/grn-benchmark/supp/annot_peak_database.csv"
multiomics_atac = "resources/inference_datasets/op_atac.h5ad",
annot_peak_database = "resources/prior/peak_annotation.csv"
)
## VIASH END

Expand Down
13 changes: 13 additions & 0 deletions src/robustness_analysis/permute_grn/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,19 @@ def main(par):
random_indices = np.random.choice(prediction.index, size=num_to_modify, replace=False)
# 3. Change the sign of the selected rows
prediction.loc[random_indices, 'weight'] *= -1
elif type == 'direction': # change the regulatory sign
# Calculate the number of rows to permute
prediction = prediction.reset_index(drop=True)
n_rows_to_permute = int(len(prediction) * (degree))
# print(n_rows_to_permute)

# Randomly select indices to permute
indices_to_permute = np.random.choice(prediction.index, size=n_rows_to_permute, replace=False)

print(indices_to_permute)
# Swap source and target for the selected rows
prediction.loc[indices_to_permute, ['source', 'target']] = prediction.loc[indices_to_permute, ['target', 'source']].values

elif type == 'binary': # change the regulatory sign
prediction['weight'] = np.where(prediction['weight'] > 0, 1, -1)
else:
Expand Down
Loading

0 comments on commit d2539c6

Please sign in to comment.