Skip to content

Commit

Permalink
Fix bug in regression 1
Browse files Browse the repository at this point in the history
  • Loading branch information
janursa committed Aug 25, 2024
1 parent dbe0511 commit 3651b47
Show file tree
Hide file tree
Showing 10 changed files with 1,046 additions and 51 deletions.
864 changes: 864 additions & 0 deletions runs.ipynb

Large diffs are not rendered by default.

24 changes: 11 additions & 13 deletions scripts/run_grn_evaluation_tw.sh
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
#!/bin/bash

# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"

RUN_ID="pearson_gb"
RUN_ID="pearson_gb_subsample"
resources_dir="s3://openproblems-data/resources/grn"
publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"
# grn_models_folder="${resources_dir}/supplementary/grn_models_noised"
grn_models_folder="${resources_dir}/grn_models"
reg_type=GB
subsample=-2
max_workers=20
max_workers=10

param_file="./params/${RUN_ID}.yaml"

Expand Down Expand Up @@ -77,14 +75,14 @@ HERE
# -c src/common/nextflow_helpers/labels_ci.config \
# -params-file ${param_file}

./tw-windows-x86_64.exe launch `
https://github.com/openproblems-bio/task_grn_benchmark.git `
--revision build/main `
--pull-latest `
--main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
--workspace 53907369739130 `
--compute-env 6TeIFgV5OY4pJCk8I0bfOh `
--params-file ./params/scgen_pearson_gb_pcs.yaml `
--config src/common/nextflow_helpers/labels_tw.config
# ./tw-windows-x86_64.exe launch `
# https://github.com/openproblems-bio/task_grn_benchmark.git `
# --revision build/main `
# --pull-latest `
# --main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
# --workspace 53907369739130 `
# --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
# --params-file ./params/scgen_pearson_gb_pcs.yaml `
# --config src/common/nextflow_helpers/labels_tw.config


86 changes: 86 additions & 0 deletions scripts/run_pc_vs_nc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# Compare GRN-model performance on protein-coding vs non-coding controls:
# generate a Nextflow params file pairing every GRN model with several
# subsampling codes, then launch the robustness-analysis workflow locally.
# Usage: ./scripts/run_pc_vs_nc.sh <run_suffix>
#   $1 is appended to RUN_ID and names the params/output directories.

# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"

# Subsample codes interpreted by src/metrics/regression_1/main.py:
#   -2: one sample per (cell_type, sm_name) combination
#   -3: negative control (DMSO)   -4: positive control (Dabrafenib/Belinostat)
subsamples=(-2 -3 -4)

RUN_ID="robust_analy_$1"
resources_dir="resources"
publish_dir="output/${RUN_ID}"

# resources_dir="s3://openproblems-data/resources/grn"
# publish_dir="s3://openproblems-data/resources/grn/results/${RUN_ID}"

grn_models_folder="${resources_dir}/grn_models"


# Evaluation settings shared by every generated param_list entry.
reg_type=ridge
max_workers=10
layer=pearson

param_file="./params/${RUN_ID}.yaml"

# GRN inference methods to evaluate; each needs ${grn_models_folder}/<name>.csv.
grn_names=(
"collectri"
"celloracle"
"scenicplus"
"figr"
"granie"
"scglue"
)



# Start writing to the YAML file
cat > $param_file << HERE
param_list:
HERE

# Append one param_list entry to $param_file.
#   $1 = GRN model name, $2 = subsample code.
# NOTE: the heredoc body is the generated YAML — indentation is significant.
append_entry() {
  cat >> $param_file << HERE
  - id: ${1}_${2}
    perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
    layer: ${layer}
    reg_type: $reg_type
    method_id: ${2}-${1}
    subsample: $2
    max_workers: $max_workers
    consensus: ${resources_dir}/prior/consensus-num-regulators.json
    prediction: ${grn_models_folder}/$1.csv
    degree: 0
HERE
}
# One entry per (subsample code, GRN model) pair.
for subsample in "${subsamples[@]}"; do
    for grn_name in "${grn_names[@]}"; do
        append_entry "$grn_name" "$subsample"
    done
done



# Append the remaining output_state and publish_dir to the YAML file
cat >> $param_file << HERE
output_state: "state.yaml"
publish_dir: "$publish_dir"
HERE

# Run the workflow locally with Docker against the generated params file.
nextflow run . \
-main-script target/nextflow/workflows/run_robustness_analysis/main.nf \
-profile docker \
-with-trace \
-c src/common/nextflow_helpers/labels_ci.config \
-params-file ${param_file}

# Alternative: Seqera/Tower launch from Windows (kept for reference).
# NOTE(review): this commented command targets run_grn_evaluation, not the
# robustness workflow above — confirm before reusing.
# ./tw-windows-x86_64.exe launch `
# https://github.com/openproblems-bio/task_grn_benchmark.git `
# --revision build/main `
# --pull-latest `
# --main-script target/nextflow/workflows/run_grn_evaluation/main.nf `
# --workspace 53907369739130 `
# --compute-env 6TeIFgV5OY4pJCk8I0bfOh `
# --params-file ./params/scgen_pearson_gb_pcs.yaml `
# --config src/common/nextflow_helpers/labels_tw.config


27 changes: 14 additions & 13 deletions scripts/run_robust_analys.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@

# RUN_ID="run_$(date +%Y-%m-%d_%H-%M-%S)"

RUN_ID="robust_analy"
degrees=(0 10 20 50 100)
noise_type="$1"
echo $noise_type

RUN_ID="robust_analy_$1"
resources_dir="resources"
publish_dir="output/${RUN_ID}"

Expand All @@ -19,7 +23,6 @@ layer=pearson

param_file="./params/${RUN_ID}.yaml"


grn_names=(
"collectri"
"celloracle"
Expand All @@ -29,8 +32,7 @@ grn_names=(
"scglue"
)

degrees=(10 20 50 100)
types=(links weight)


# Start writing to the YAML file
cat > $param_file << HERE
Expand All @@ -39,28 +41,27 @@ HERE

append_entry() {
cat >> $param_file << HERE
- id: ${1}_${2}_${3}
- id: ${1}_${2}
perturbation_data: ${resources_dir}/grn-benchmark/perturbation_data.h5ad
layer: ${layer}
reg_type: $reg_type
method_id: $1
method_id: ${2}-${1}
subsample: $subsample
max_workers: $max_workers
consensus: ${resources_dir}/prior/consensus-num-regulators.json
prediction: ${grn_models_folder}/$1.csv
degree: ${3}
type: ${2}
degree: ${2}
noise_type: ${noise_type}
HERE
}
# Loop through grn_names and layers
for type in "${types[@]}"; do
for degree in "${degrees[@]}"; do
for grn_name in "${grn_names[@]}"; do
append_entry "$grn_name" "$type" "$degree"
done
for degree in "${degrees[@]}"; do
for grn_name in "${grn_names[@]}"; do
append_entry "$grn_name" "$degree"
done
done


# Append the remaining output_state and publish_dir to the YAML file
cat >> $param_file << HERE
output_state: "state.yaml"
Expand Down
2 changes: 0 additions & 2 deletions src/methods/multi_omics/scglue/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ def preprocess(rna, atac, par):
sc.pp.neighbors(atac, use_rep="X_lsi", metric="cosine")
sc.tl.umap(atac)
print('step 2 completed')



scglue.data.get_gene_annotation(
rna, gtf=par['annotation_file'],
Expand Down
9 changes: 7 additions & 2 deletions src/metrics/regression_1/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,6 @@ def main(par):
reg_type = par['reg_type']
max_workers = par['max_workers']
layer = par["layer"]
pert_df = pd.DataFrame(perturbation_data.layers[layer], columns=gene_names)

if subsample == -1:
pass
elif subsample == -2: # one combination of cell_type, sm_name
Expand All @@ -192,11 +190,18 @@ def main(par):
for _, row in obs.iterrows():
mask.append((sampled_obs==row).all(axis=1).any())
perturbation_data = perturbation_data[mask,:]
elif subsample == -3: #negative control
mask = perturbation_data.obs.sm_name == 'Dimethyl Sulfoxide'
perturbation_data = perturbation_data[mask,:]
elif subsample == -4: #positive control
mask = perturbation_data.obs.sm_name.isin(['Dabrafenib', 'Belinostat'])
perturbation_data = perturbation_data[mask,:]
else:
perturbation_data = perturbation_data[np.random.choice(perturbation_data.n_obs, subsample, replace=False), :]

print(perturbation_data.shape)

pert_df = pd.DataFrame(perturbation_data.layers[layer], columns=gene_names)
pert_df = pert_df.T # make it gene*sample

# process net
Expand Down
74 changes: 60 additions & 14 deletions src/robustness_analysis/script.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,74 @@
"prediction": "resources/grn_models/collectri.csv",
"prediction_n": "output/grn_noised.csv",
'degree': 20,
'type': 'links'
'noise_type': 'links'
}

## VIASH END

degree = par['degree']/100
type = par['noise_type']


prediction = pd.read_csv(par['prediction'])
assert 'weight' in prediction.columns

if type =='weight':
print('Add noise to weight')
std_dev = prediction['weight'].std()
noise = np.random.normal(0, degree * std_dev, size=prediction['weight'].shape)
prediction['weight'] += noise

elif type =='links':
print('Permute links')
num_rows_to_permute = int(len(prediction) * degree)
permute_indices = np.random.choice(prediction.index, size=num_rows_to_permute, replace=False)


if type == 'weight': # add noise to weight
assert 'weight' in prediction.columns
print('Add noise to weight')
std_dev = prediction['weight'].std()
noise = np.random.normal(0, degree * std_dev, size=prediction['weight'].shape)
prediction['weight'] += noise

elif type == 'links': # shuffle source-target-weight
print('Permute links')
num_rows_to_permute = int(len(prediction) * degree)
permute_indices = np.random.choice(prediction.index, size=num_rows_to_permute, replace=False)
prediction.loc[permute_indices, 'weight'] = np.random.permutation(prediction.loc[permute_indices, 'weight'].values)

elif type == 'net': # shuffle source-target matrix
print('Permute links')

prediction.loc[permute_indices, 'weight'] = np.random.permutation(prediction.loc[permute_indices, 'weight'].values)
# 1. Pivot the GRN with target as index and source as columns
pivot_df = prediction.pivot(index='target', columns='source', values='weight')

# Fill NaNs with 0 or a value of your choice
pivot_df.fillna(0, inplace=True)

# 2. Randomly choose 20% of the matrix to shuffle
matrix_flattened = pivot_df.values.flatten()
n_elements = len(matrix_flattened)
n_shuffle = int(n_elements * degree)

# Randomly select 20% of the matrix elements' indices
shuffle_indices = np.random.choice(n_elements, n_shuffle, replace=False)

# Get the values that will be shuffled
shuffle_values = matrix_flattened[shuffle_indices]

# 3. Shuffle the selected values
np.random.shuffle(shuffle_values)

# Assign the shuffled values back to the selected positions
matrix_flattened[shuffle_indices] = shuffle_values

# Reshape the flattened array back into the matrix
pivot_df_shuffled = pd.DataFrame(matrix_flattened.reshape(pivot_df.shape),
index=pivot_df.index,
columns=pivot_df.columns)

flat_df = pivot_df_shuffled.reset_index()

# Melt the DataFrame to turn it back into long-form (source-target-weight)
prediction = flat_df.melt(id_vars='target', var_name='source', value_name='weight')


prediction = prediction[prediction['weight'] !=0 ].reset_index(drop=True)


else:
raise ValueError(f'Wrong type ({type}) for adding noise')

print('Output noised GRN')
prediction.to_csv(par['prediction_n'])

3 changes: 1 addition & 2 deletions src/workflows/run_grn_evaluation/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ workflow run_wf {

// construct list of metrics
metrics = [
regression_1,
regression_2
regression_1
]

/***************************
Expand Down
2 changes: 1 addition & 1 deletion src/workflows/run_robustness_analysis/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ functionality:
required: false
direction: input
default: 20
- name: --type
- name: --noise_type
type: string
required: false
direction: input
Expand Down
6 changes: 2 additions & 4 deletions src/workflows/run_robustness_analysis/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ workflow run_wf {

| noise_grn.run(
fromState: [
prediction: "prediction"
prediction: "prediction", degree: "degree", noise_type: "noise_type"
],
toState: [
prediction_n: "prediction_n"
Expand All @@ -49,9 +49,7 @@ workflow run_wf {
reg_type: "reg_type",
method_id: "method_id",
max_workers: "max_workers",
consensus: "consensus",
degree: "degree",
type: "type"
consensus: "consensus"
],
// use 'toState' to publish that component's outputs to the overall state
toState: { id, output, state, comp ->
Expand Down

0 comments on commit 3651b47

Please sign in to comment.