Skip to content

Commit

Permalink
small changes to regressions. actions config changed to resources_test
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Oct 9, 2024
1 parent 1180b20 commit f2a6066
Show file tree
Hide file tree
Showing 17 changed files with 624 additions and 1,916 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
id: cache
with:
s3_bucket: $s3_bucket
dest_path: resources
dest_path: resources_test
cache_key_prefix: resources__

- id: ns_list
Expand Down
2,009 changes: 170 additions & 1,839 deletions runs.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/api/comp_method.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ functionality:
example: resources_test/grn_models/collectri.csv
- name: --tf_all
type: file
required: true
required: false
direction: input
example: resources_test/prior/tf_all.csv
- name: --max_n_links
Expand Down
21 changes: 14 additions & 7 deletions src/methods/multi_omics/scenicplus_ns/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ publish_dir: "$publish_dir"
HERE


nextflow run . \
-main-script target/nextflow/workflows/grn_inference_scenicplus/main.nf \
-profile docker \
-with-trace \
-c src/common/nextflow_helpers/labels_ci.config \
-params-file params/${RUN_ID}.yaml
# nextflow run . \
# -main-script target/nextflow/workflows/grn_inference_scenicplus/main.nf \
# -profile docker \
# -with-trace \
# -c src/common/nextflow_helpers/labels_ci.config \
# -params-file params/${RUN_ID}.yaml


# ./tw-windows-x86_64.exe launch `
Expand All @@ -40,5 +40,12 @@ nextflow run . \
# --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
# --params-file ./params/scenicplus.yaml `
# --config src/common/nextflow_helpers/labels_tw.config

./tw launch https://github.com/openproblems-bio/task_grn_inference \
--revision build/main \
--pull-latest \
--main-script target/nextflow/workflows/grn_inference_scenicplus/main.nf \
--workspace 53907369739130 \
--compute-env 6TeIFgV5OY4pJCk8I0bfOh \
--params-file ${param_file} \
--config src/common/nextflow_helpers/labels_tw.config

19 changes: 17 additions & 2 deletions src/methods/multi_omics/scglue/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,13 +264,28 @@ def prune_grn(par):
"--output", f"{par['temp_dir']}/pruned_grn.csv",
"--top_n_targets", str(par['top_n_targets']),
"--rank_threshold", str(par['rank_threshold']),
"--auc_threshold", "0.1",
"--nes_threshold", str(par['nes_threshold']),
"--auc_threshold", "0",
"--nes_threshold", "0",
"--min_genes", "1",
"--thresholds", "0.5", "0.7",
"--top_n_regulators", "10", "50", "100",
"--max_similarity_fdr", "0.1",
"--num_workers", f"{par['num_workers']}",
"--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
"--gene_attribute", "name"
]

# --thresholds, The first method to create the TF-modules based on the best targets for each transcription factor (default: 0.75 0.90).
# --top_n_targets TOP_N_TARGETS [TOP_N_TARGETS ...]
# The second method is to select the top targets for a given TF. (default: 50)
# --top_n_regulators TOP_N_REGULATORS [TOP_N_REGULATORS ...]
# The alternative way to create the TF-modules is to select the best regulators for each gene. (default: 5 10 50)
# --max_similarity_fdr MAX_SIMILARITY_FDR
# Maximum FDR in motif similarity to use when annotating enriched motifs (default: 0.001)
# --auc_threshold AUC_THRESHOLD
# The threshold used for calculating the AUC of a feature as fraction of ranked genes (default: 0.05).
# --nes_threshold NES_THRESHOLD
# The Normalized Enrichment Score (NES) threshold for finding enriched features (default: 3.0).

result = subprocess.run(command, check=True)
print("Output:")
Expand Down
4 changes: 2 additions & 2 deletions src/methods/multi_omics/scglue/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
"num_workers": 20,
"prediction": "output/scglue_d0_hvg.csv",
"max_n_links": 50000,
"nes_threshold": 1,
"rank_threshold": 5000,
"nes_threshold": 0,
"rank_threshold": 10000,
"top_n_targets": 100,
'normalize': False,
'extend_range': 150000
Expand Down
4 changes: 4 additions & 0 deletions src/metrics/regression_1/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ functionality:
type: string
direction: input
example: resources/prior/skeleton.csv'
- name: --apply_skeleton
type: boolean
direction: input
default: true
resources:
- type: python_script
path: script.py
Expand Down
4 changes: 2 additions & 2 deletions src/metrics/regression_1/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,9 @@ def main(par):
gene_names = perturbation_data.var.index.to_numpy()
net = pd.read_csv(par['prediction'])

if True: #apply skeleton
if par['apply_skeleton']: #apply skeleton
print('Before filtering with skeleton:', net.shape)
skeleton = np.savetxt(par['skeleton'], all_links.values, fmt='%s')
skeleton = np.loadtxt(par['skeleton'], dtype=str)
net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
net = net[net['link'].isin(skeleton)]
print('After filtering with skeleton:', net.shape)
Expand Down
3 changes: 2 additions & 1 deletion src/metrics/regression_1/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@
'layer': 'scgen_pearson',
'subsample': -2,
'num_workers': 4,
'skeleton': 'resources/prior/skeleton.csv'
'skeleton': 'resources/prior/skeleton.csv',
'apply_skeleton': True
}
## VIASH END
# meta = {
Expand Down
8 changes: 8 additions & 0 deletions src/metrics/regression_2/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,14 @@ functionality:
direction: input
type: boolean
default: true
- name: --skeleton
type: string
direction: input
example: resources/prior/skeleton.csv'
- name: --apply_skeleton
type: boolean
direction: input
default: true

platforms:
- type: docker
Expand Down
2 changes: 1 addition & 1 deletion src/metrics/regression_2/consensus/create-consensus.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
viash run src/metrics/regression_2/consensus/config.novsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
--output resources/grn-benchmark/consensus-num-regulators.json \
--output resources/prior/consensus-num-regulators.json \
--grn_folder resources/grn_models/ \
--grns ananse.csv,celloracle.csv,figr.csv,granie.csv,scenicplus.csv,scglue.csv
15 changes: 10 additions & 5 deletions src/metrics/regression_2/consensus/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
# 'models_dir': 'resources/grn-benchmark/grn_models/d0_hvg',
# 'models': [pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle],
'output': 'resources/grn-benchmark/consensus-num-regulators.json'
'output': 'resources/prior/consensus-num-regulators.json'
}
## VIASH END
import argparse
Expand All @@ -34,15 +34,20 @@
gene_names = adata_rna.var.gene.to_numpy()
except:
gene_names = adata_rna.var.index.to_numpy()

gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
print(par['perturbation_data'])
print(par['models'])

# Load inferred GRNs
grns = []
for filename in par['models']:
filepath = os.path.join(par['models_dir'], f'{filename}.csv')
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
for model in par['models']:
filepath = os.path.join(par['models_dir'], f'{model}.csv')
if not os.path.exists(filepath):
print(f"{filepath} didnt exist. Skipped.")
continue
if model == 'collectri':
print(f"skipping collectri")
continue
A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
for source, target, weight in zip(df['source'], df['target'], df['weight']):
Expand Down
53 changes: 32 additions & 21 deletions src/metrics/regression_2/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,24 @@

def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray:
gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
df = pd.read_csv(filepath, sep=',', header='infer')
if 'cell_type' in df.columns:
net = pd.read_csv(filepath, sep=',', header='infer')
if 'cell_type' in net.columns:
verbose_print(par['verbose'], 'Taking mean of cell type specific grns', 3)
df.drop(columns=['cell_type'], inplace=True)
df = df.groupby(['source', 'target']).mean().reset_index()
net.drop(columns=['cell_type'], inplace=True)
net = net.groupby(['source', 'target']).mean().reset_index()
if par['apply_skeleton']: #apply skeleton
print('Before filtering with skeleton:', net.shape)
skeleton = np.loadtxt(par['skeleton'], dtype=str)
net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
net = net[net['link'].isin(skeleton)]
print('After filtering with skeleton:', net.shape)
# keep only top n links
if df.shape[0] > par['max_n_links']:
if net.shape[0] > par['max_n_links']:
verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3)
df = process_links(df, par)
net = process_links(net, par)
# convert to matrix
A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
for source, target, weight in zip(df['source'], df['target'], df['weight']):
for source, target, weight in zip(net['source'], net['target'], net['weight']):
if (source not in gene_dict) or (target not in gene_dict):
continue
i = gene_dict[source]
Expand Down Expand Up @@ -170,7 +176,6 @@ def cross_validate(
if n_features[j] > 0:
res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]),n_jobs=n_jobs)
results.append(res)

return {
'gene_names': list(gene_names),
'scores': list(results)
Expand Down Expand Up @@ -238,19 +243,23 @@ def static_approach(

# Cross-validate each gene using the inferred GRN to define select input features
res = cross_validate(reg_type, gene_names, tf_names, X, groups, grn, n_features, n_jobs=n_jobs)
r2 = []

for i in range(len(res['scores'])):
gene_name = res['gene_names'][i]
if n_features[n_features_dict[gene_name]] != 0:
score = res['scores'][i]['avg-r2']
if clip_scores:
score = np.clip(score, 0, 1)
r2.append(score)
# mean_r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
mean_r2_scores = float(np.mean(r2))

# return np.mean(mean_r2_scores)

# scores
if False: # there is a bug here
r2 = []
for i in range(len(res['scores'])):
gene_name = res['gene_names'][i]
if n_features[n_features_dict[gene_name]] != 0:
score = res['scores'][i]['avg-r2']
if clip_scores:
score = np.clip(score, 0, 1)
r2.append(score)
if len(r2)==0:
raise ValueError('R2 score is empty')
mean_r2_scores = float(np.mean(r2))
else:
r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
mean_r2_scores = np.mean(r2_scores)
return mean_r2_scores


Expand Down Expand Up @@ -309,6 +318,8 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3)
# print(f'Dynamic approach:', flush=True)
verbose_print(par['verbose'], f'Static approach (theta=0):', 3)
# print(np.any(n_features_theta_min!=0))
# aa
score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores)
verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3)
score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores)
Expand Down
21 changes: 15 additions & 6 deletions src/metrics/regression_2/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
par = {
'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
'layer': 'scgen_pearson',
"prediction": "resources/grn_models/donor_0_celltype/grnboost2.csv",
"prediction": "resources/grn_models/full/grnboost2.csv",
'tf_all': 'resources/prior/tf_all.csv',
"max_n_links": 50000,
'consensus': 'resources/prior/consensus-num-regulators.json',
Expand All @@ -20,15 +20,24 @@
'num_workers': 4,
'apply_tf': True,
'clip_scores': True,
'method_id': 'grnboost'
'method_id': 'grnboost',
'apply_skeleton': False,
'skeleton': 'resources/prior/skeleton.csv',
'verbose': 2

}
## VIASH END
# meta = {
# "resources_dir":'src/metrics/regression_1/'
# }

print(par)
sys.path.append(meta['resources_dir'])
try:
sys.path.append(meta['resources_dir'])
except:
meta = {
"resources_dir": 'src/metrics/',
"util": 'src/utils'
}
sys.path.append(meta["resources_dir"])
sys.path.append(meta["util"])
from main import main

if isinstance(par['reg_type'], list) and (len(par['reg_type']) == 1):
Expand Down
54 changes: 34 additions & 20 deletions src/metrics/script_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@
## VIASH START
par = {
'reg_type': 'ridge',
'models_dir': "resources/grn_models/d0_hvg",
'models_dir': "resources/grn_models/",
'scores_dir': "resources/scores/",

'methods': [ 'scenicplus', 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle'],
'methods': [ 'scenicplus', 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],
'layers': ['scgen_pearson', 'lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm'],
# 'layers': ['scgen_pearson'],

"perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
"tf_all": "resources/prior/tf_all.csv",
'skeleton': 'resources/prior/skeleton.csv',
"max_n_links": 50000,
"apply_tf": "true",
'subsample': -2,
Expand All @@ -23,7 +25,8 @@
'num_workers': 20,
'consensus': 'resources/prior/consensus-num-regulators.json',
'static_only': True,
'clip_scores': True
'clip_scores': True,
'apply_skeleton': False
}
# VIASH END

Expand All @@ -33,6 +36,7 @@
parser.add_argument('--num_workers', type=str, help='Number of cores')
parser.add_argument('--models_dir', type=str, help='where to read the models')
parser.add_argument('--scores_dir', type=str, help='where to write the model')
parser.add_argument('--apply_skeleton', type=str, help='where to apply the skeleton')

args = parser.parse_args()

Expand All @@ -44,6 +48,8 @@
par['models_dir'] = args.models_dir
if args.scores_dir:
par['scores_dir'] = args.scores_dir
if args.apply_skeleton:
par['apply_skeleton'] = args.apply_skeleton

meta = {
"resources_dir": 'src/metrics/',
Expand All @@ -54,21 +60,29 @@

os.makedirs(par['scores_dir'], exist_ok=True)


for layer in par['layers']:
par['layer'] = layer
for i, method in enumerate(par['methods']):
par['prediction'] = f"{par['models_dir']}/{method}.csv"
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
reg2 = main(par)
score = pd.concat([reg1, reg2], axis=1)
score.index = [method]
if i==0:
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{par['scores_dir']}/{layer}-{par['reg_type']}.csv")
print(df_all)
for dataset_type in ['full', 'hvg']:
for apply_skeleton in [False, True]:
os.makedirs(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}", exist_ok=True)
for layer in par['layers']:
par['layer'] = layer
i = 0
for method in par['methods']:
print(method)
par['prediction'] = f"{par['models_dir']}/{dataset_type}/{method}.csv"
if not os.path.exists(par['prediction']):
print(f"{par['prediction']} doesnt exist. Skipped.")
continue
from regression_1.main import main
reg1 = main(par)
from regression_2.main import main
reg2 = main(par)
score = pd.concat([reg1, reg2], axis=1)
score.index = [method]
if i==0:
df_all = score
else:
df_all = pd.concat([df_all, score])
df_all.to_csv(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}/{layer}-{par['reg_type']}.csv")
print(df_all)
i+=1

Loading

0 comments on commit f2a6066

Please sign in to comment.