small changes to regressions. actions config changed to resources_test

openproblems-bio · Oct 9, 2024 · f2a6066 · f2a6066
1 parent 1180b20
commit f2a6066
Show file tree

Hide file tree

Showing 17 changed files with 624 additions and 1,916 deletions.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
@@ -41,7 +41,7 @@ jobs:
       id: cache
       with:
         s3_bucket: $s3_bucket
-        dest_path: resources
+        dest_path: resources_test
         cache_key_prefix: resources__
 
     - id: ns_list

diff --git a/runs.ipynb b/runs.ipynb
diff --git a/src/api/comp_method.yaml b/src/api/comp_method.yaml
@@ -25,7 +25,7 @@ functionality:
       example: resources_test/grn_models/collectri.csv
     - name: --tf_all
       type: file
-      required: true
+      required: false
       direction: input
       example: resources_test/prior/tf_all.csv
     - name: --max_n_links

diff --git a/src/methods/multi_omics/scenicplus_ns/run.sh b/src/methods/multi_omics/scenicplus_ns/run.sh
@@ -23,12 +23,12 @@ publish_dir: "$publish_dir"
 HERE
 
 
-nextflow run . \
-  -main-script  target/nextflow/workflows/grn_inference_scenicplus/main.nf \
-  -profile docker \
-  -with-trace \
-  -c src/common/nextflow_helpers/labels_ci.config \
-  -params-file params/${RUN_ID}.yaml
+# nextflow run . \
+#   -main-script  target/nextflow/workflows/grn_inference_scenicplus/main.nf \
+#   -profile docker \
+#   -with-trace \
+#   -c src/common/nextflow_helpers/labels_ci.config \
+#   -params-file params/${RUN_ID}.yaml
 
 
 # ./tw-windows-x86_64.exe launch `
@@ -40,5 +40,12 @@ nextflow run . \
 #       --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
 #       --params-file ./params/scenicplus.yaml `
 #       --config src/common/nextflow_helpers/labels_tw.config
-
+./tw launch https://github.com/openproblems-bio/task_grn_inference \
+  --revision build/main \
+  --pull-latest \
+  --main-script target/nextflow/workflows/grn_inference_scenicplus/main.nf \
+  --workspace 53907369739130 \
+  --compute-env 6TeIFgV5OY4pJCk8I0bfOh \
+  --params-file ${param_file} \
+  --config src/common/nextflow_helpers/labels_tw.config
 
diff --git a/src/methods/multi_omics/scglue/main.py b/src/methods/multi_omics/scglue/main.py
@@ -264,13 +264,28 @@ def prune_grn(par):
         "--output", f"{par['temp_dir']}/pruned_grn.csv",
         "--top_n_targets", str(par['top_n_targets']),
         "--rank_threshold", str(par['rank_threshold']),
-        "--auc_threshold", "0.1",
-        "--nes_threshold", str(par['nes_threshold']), 
+        "--auc_threshold", "0",
+        "--nes_threshold", "0", 
         "--min_genes", "1",
+        "--thresholds", "0.5", "0.7", 
+        "--top_n_regulators", "10", "50", "100",
+        "--max_similarity_fdr", "0.1",
         "--num_workers", f"{par['num_workers']}",
         "--cell_id_attribute", "obs_id", # be sure that obs_id is in obs and name is in var
         "--gene_attribute", "name"
     ]
+
+#       --thresholds, The first method to create the TF-modules based on the best targets for each transcription factor (default: 0.75 0.90).
+#   --top_n_targets TOP_N_TARGETS [TOP_N_TARGETS ...]
+#                         The second method is to select the top targets for a given TF. (default: 50)
+# --top_n_regulators TOP_N_REGULATORS [TOP_N_REGULATORS ...]
+#                         The alternative way to create the TF-modules is to select the best regulators for each gene. (default: 5 10 50)
+# --max_similarity_fdr MAX_SIMILARITY_FDR
+#                         Maximum FDR in motif similarity to use when annotating enriched motifs (default: 0.001)
+# --auc_threshold AUC_THRESHOLD
+#                         The threshold used for calculating the AUC of a feature as fraction of ranked genes (default: 0.05).
+# --nes_threshold NES_THRESHOLD
+#                         The Normalized Enrichment Score (NES) threshold for finding enriched features (default: 3.0).
 
     result = subprocess.run(command, check=True)
     print("Output:")

diff --git a/src/methods/multi_omics/scglue/script.py b/src/methods/multi_omics/scglue/script.py
@@ -11,8 +11,8 @@
   "num_workers": 20,
   "prediction": "output/scglue_d0_hvg.csv",
   "max_n_links": 50000,
-  "nes_threshold": 1,
-  "rank_threshold": 5000,
+  "nes_threshold": 0,
+  "rank_threshold": 10000,
   "top_n_targets": 100,
   'normalize': False,
   'extend_range': 150000

diff --git a/src/metrics/regression_1/config.vsh.yaml b/src/metrics/regression_1/config.vsh.yaml
@@ -23,6 +23,10 @@ functionality:
       type: string
       direction: input
       example: resources/prior/skeleton.csv'
+    - name: --apply_skeleton 
+      type: boolean
+      direction: input
+      default: true
   resources:
     - type: python_script
       path: script.py

diff --git a/src/metrics/regression_1/main.py b/src/metrics/regression_1/main.py
@@ -207,9 +207,9 @@ def main(par):
     gene_names = perturbation_data.var.index.to_numpy()
     net = pd.read_csv(par['prediction'])
 
-    if True: #apply skeleton
+    if par['apply_skeleton']: #apply skeleton
         print('Before filtering with skeleton:', net.shape)
-        skeleton = np.savetxt(par['skeleton'], all_links.values, fmt='%s')
+        skeleton = np.loadtxt(par['skeleton'], dtype=str)
         net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
         net = net[net['link'].isin(skeleton)]
         print('After filtering with skeleton:', net.shape)

diff --git a/src/metrics/regression_1/script.py b/src/metrics/regression_1/script.py
@@ -17,7 +17,8 @@
   'layer': 'scgen_pearson',
   'subsample': -2,
   'num_workers': 4,
-  'skeleton': 'resources/prior/skeleton.csv'
+  'skeleton': 'resources/prior/skeleton.csv',
+  'apply_skeleton': True
 }
 ## VIASH END
 # meta = {

diff --git a/src/metrics/regression_2/config.vsh.yaml b/src/metrics/regression_2/config.vsh.yaml
@@ -23,6 +23,14 @@ functionality:
       direction: input
       type: boolean
       default: true
+    - name: --skeleton 
+      type: string
+      direction: input
+      example: resources/prior/skeleton.csv'
+    - name: --apply_skeleton 
+      type: boolean
+      direction: input
+      default: true
 
 platforms:
   - type: docker

diff --git a/src/metrics/regression_2/consensus/create-consensus.sh b/src/metrics/regression_2/consensus/create-consensus.sh
@@ -1,4 +1,4 @@
 viash run src/metrics/regression_2/consensus/config.novsh.yaml -- --perturbation_data resources/grn-benchmark/perturbation_data.h5ad \
-     --output resources/grn-benchmark/consensus-num-regulators.json \
+     --output resources/prior/consensus-num-regulators.json \
      --grn_folder resources/grn_models/ \
      --grns ananse.csv,celloracle.csv,figr.csv,granie.csv,scenicplus.csv,scglue.csv
diff --git a/src/metrics/regression_2/consensus/script.py b/src/metrics/regression_2/consensus/script.py
@@ -13,7 +13,7 @@
     'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
     # 'models_dir': 'resources/grn-benchmark/grn_models/d0_hvg',
     # 'models': [pearson_corr, pearson_causal, portia, ppcor, genie3, grnboost2, scenic, scglue, celloracle],
-    'output': 'resources/grn-benchmark/consensus-num-regulators.json'
+    'output': 'resources/prior/consensus-num-regulators.json'
 }
 ## VIASH END
 import argparse
@@ -34,15 +34,20 @@
     gene_names = adata_rna.var.gene.to_numpy()
 except:
     gene_names = adata_rna.var.index.to_numpy()
-
+gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
 print(par['perturbation_data'])
 print(par['models'])
 
 # Load inferred GRNs
 grns = []
-for filename in par['models']:
-    filepath = os.path.join(par['models_dir'], f'{filename}.csv')
-    gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
+for model in par['models']:
+    filepath = os.path.join(par['models_dir'], f'{model}.csv')
+    if not os.path.exists(filepath):
+        print(f"{filepath} didnt exist. Skipped.")
+        continue 
+    if model == 'collectri':
+        print(f"skipping collectri")
+        continue
     A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
     df = pd.read_csv(filepath, sep=',', header='infer', index_col=0)
     for source, target, weight in zip(df['source'], df['target'], df['weight']):

diff --git a/src/metrics/regression_2/main.py b/src/metrics/regression_2/main.py
@@ -20,18 +20,24 @@
 
 def load_grn(filepath: str, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray:
     gene_dict = {gene_name: i for i, gene_name in enumerate(gene_names)}
-    df = pd.read_csv(filepath, sep=',', header='infer')
-    if 'cell_type' in df.columns:
+    net = pd.read_csv(filepath, sep=',', header='infer')
+    if 'cell_type' in net.columns:
         verbose_print(par['verbose'], 'Taking mean of cell type specific grns', 3)
-        df.drop(columns=['cell_type'], inplace=True)
-        df = df.groupby(['source', 'target']).mean().reset_index()
+        net.drop(columns=['cell_type'], inplace=True)
+        net = net.groupby(['source', 'target']).mean().reset_index()
+    if par['apply_skeleton']: #apply skeleton
+        print('Before filtering with skeleton:', net.shape)
+        skeleton = np.loadtxt(par['skeleton'], dtype=str)
+        net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
+        net = net[net['link'].isin(skeleton)]
+        print('After filtering with skeleton:', net.shape)
     # keep only top n links
-    if df.shape[0] > par['max_n_links']:
+    if net.shape[0] > par['max_n_links']:
         verbose_print(par['verbose'], f"Reducing number of links to {par['max_n_links']}", 3)
-        df = process_links(df, par)
+        net = process_links(net, par)
     # convert to matrix
     A = np.zeros((len(gene_names), len(gene_names)), dtype=float)
-    for source, target, weight in zip(df['source'], df['target'], df['weight']):
+    for source, target, weight in zip(net['source'], net['target'], net['weight']):
         if (source not in gene_dict) or (target not in gene_dict):
             continue
         i = gene_dict[source]
@@ -170,7 +176,6 @@ def cross_validate(
         if n_features[j] > 0:
             res = cross_validate_gene(estimator_t, X, groups, grn, j, n_features=int(n_features[j]),n_jobs=n_jobs)
             results.append(res)
-
     return {
         'gene_names': list(gene_names),
         'scores': list(results)
@@ -238,19 +243,23 @@ def static_approach(
 
     # Cross-validate each gene using the inferred GRN to define select input features
     res = cross_validate(reg_type, gene_names, tf_names, X, groups, grn, n_features, n_jobs=n_jobs)
-    r2 = []
-
-    for i in range(len(res['scores'])):
-        gene_name = res['gene_names'][i]
-        if n_features[n_features_dict[gene_name]] != 0:
-            score = res['scores'][i]['avg-r2']
-            if clip_scores:
-               score = np.clip(score, 0, 1)
-            r2.append(score)
-    # mean_r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
-    mean_r2_scores = float(np.mean(r2))
-
-    # return np.mean(mean_r2_scores)
+
+    # scores
+    if False: # there is a bug here
+        r2 = []
+        for i in range(len(res['scores'])):
+            gene_name = res['gene_names'][i]
+            if n_features[n_features_dict[gene_name]] != 0:
+                score = res['scores'][i]['avg-r2']
+                if clip_scores:
+                score = np.clip(score, 0, 1)
+                r2.append(score)
+        if len(r2)==0:
+            raise ValueError('R2 score is empty')
+        mean_r2_scores = float(np.mean(r2))
+    else:
+        r2_scores = np.asarray([res['scores'][j]['avg-r2'] for j in range(len(res['scores']))])
+        mean_r2_scores = np.mean(r2_scores)
     return mean_r2_scores
 
 
@@ -309,6 +318,8 @@ def main(par: Dict[str, Any]) -> pd.DataFrame:
     verbose_print(par['verbose'], f'Compute metrics for layer: {layer}', 3)
     # print(f'Dynamic approach:', flush=True)
     verbose_print(par['verbose'], f'Static approach (theta=0):', 3)
+    # print(np.any(n_features_theta_min!=0))
+    # aa
     score_static_min = static_approach(grn, n_features_theta_min, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores)
     verbose_print(par['verbose'], f'Static approach (theta=0.5):', 3)
     score_static_median = static_approach(grn, n_features_theta_median, X, groups, gene_names, tf_names, par['reg_type'], n_jobs=par['num_workers'], n_features_dict=n_features_dict, clip_scores=clip_scores)

diff --git a/src/metrics/regression_2/script.py b/src/metrics/regression_2/script.py
@@ -8,7 +8,7 @@
 par = {
     'perturbation_data': 'resources/grn-benchmark/perturbation_data.h5ad',
     'layer': 'scgen_pearson',
-    "prediction": "resources/grn_models/donor_0_celltype/grnboost2.csv",
+    "prediction": "resources/grn_models/full/grnboost2.csv",
     'tf_all': 'resources/prior/tf_all.csv',
     "max_n_links": 50000,
     'consensus': 'resources/prior/consensus-num-regulators.json',
@@ -20,15 +20,24 @@
     'num_workers': 4,
     'apply_tf': True,
     'clip_scores': True,
-    'method_id': 'grnboost'
+    'method_id': 'grnboost',
+    'apply_skeleton': False,
+    'skeleton': 'resources/prior/skeleton.csv',
+    'verbose': 2
 
 }
 ## VIASH END
-# meta = {
-#   "resources_dir":'src/metrics/regression_1/'
-# }
+
 print(par)
-sys.path.append(meta['resources_dir'])
+try:
+    sys.path.append(meta['resources_dir'])
+except:
+    meta = {
+    "resources_dir": 'src/metrics/',
+    "util": 'src/utils'
+    }
+    sys.path.append(meta["resources_dir"])
+    sys.path.append(meta["util"])
 from main import main
 
 if isinstance(par['reg_type'], list) and (len(par['reg_type']) == 1):

diff --git a/src/metrics/script_all.py b/src/metrics/script_all.py
@@ -7,14 +7,16 @@
 ## VIASH START
 par = {
   'reg_type': 'ridge',
-  'models_dir': "resources/grn_models/d0_hvg",
+  'models_dir': "resources/grn_models/",
   'scores_dir': "resources/scores/",
 
-  'methods': [ 'scenicplus', 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle'],
+  'methods': [ 'scenicplus', 'collectri', 'negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],
   'layers': ['scgen_pearson', 'lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm'],
+  # 'layers': ['scgen_pearson'],
 
   "perturbation_data": "resources/grn-benchmark/perturbation_data.h5ad",
   "tf_all": "resources/prior/tf_all.csv",
+  'skeleton': 'resources/prior/skeleton.csv', 
   "max_n_links": 50000,
   "apply_tf": "true",
   'subsample': -2,
@@ -23,7 +25,8 @@
   'num_workers': 20,
   'consensus': 'resources/prior/consensus-num-regulators.json',
   'static_only': True,
-  'clip_scores': True
+  'clip_scores': True,
+  'apply_skeleton': False
 }
 # VIASH END
 
@@ -33,6 +36,7 @@
 parser.add_argument('--num_workers', type=str, help='Number of cores')
 parser.add_argument('--models_dir', type=str, help='where to read the models')
 parser.add_argument('--scores_dir', type=str, help='where to write the model')
+parser.add_argument('--apply_skeleton', type=str, help='where to apply the skeleton')
 
 args = parser.parse_args()
 
@@ -44,6 +48,8 @@
   par['models_dir'] = args.models_dir
 if args.scores_dir:
   par['scores_dir'] = args.scores_dir
+if args.apply_skeleton:
+  par['apply_skeleton'] = args.apply_skeleton
 
 meta = {
   "resources_dir": 'src/metrics/',
@@ -54,21 +60,29 @@
 
 os.makedirs(par['scores_dir'], exist_ok=True)
 
-
-for layer in par['layers']:
-  par['layer'] = layer
-  for i, method in enumerate(par['methods']):
-    par['prediction'] = f"{par['models_dir']}/{method}.csv"
-    from regression_1.main import main 
-    reg1 = main(par)
-    from regression_2.main import main 
-    reg2 = main(par)
-    score = pd.concat([reg1, reg2], axis=1)
-    score.index = [method]
-    if i==0:
-      df_all = score
-    else:
-      df_all = pd.concat([df_all, score])
-    df_all.to_csv(f"{par['scores_dir']}/{layer}-{par['reg_type']}.csv")
-    print(df_all)
+for dataset_type in ['full', 'hvg']:
+  for apply_skeleton in [False, True]:
+    os.makedirs(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}", exist_ok=True)
+    for layer in par['layers']:
+      par['layer'] = layer
+      i = 0
+      for method in par['methods']:
+        print(method)
+        par['prediction'] = f"{par['models_dir']}/{dataset_type}/{method}.csv"
+        if not os.path.exists(par['prediction']):
+          print(f"{par['prediction']} doesnt exist. Skipped.")
+          continue
+        from regression_1.main import main 
+        reg1 = main(par)
+        from regression_2.main import main 
+        reg2 = main(par)
+        score = pd.concat([reg1, reg2], axis=1)
+        score.index = [method]
+        if i==0:
+          df_all = score
+        else:
+          df_all = pd.concat([df_all, score])
+        df_all.to_csv(f"{par['scores_dir']}/{dataset_type}/skeleton_{apply_skeleton}/{layer}-{par['reg_type']}.csv")
+        print(df_all)
+        i+=1