Update emd per matching sample metric #16

Open: wants to merge 18 commits into base: main
1 change: 0 additions & 1 deletion scripts/run_benchmark/run_test_local.sh
@@ -24,6 +24,5 @@ nextflow run . \
--input_unintegrated_censored resources_test/task_cyto_batch_integration/starter_file/unintegrated_censored.h5ad \
--input_unintegrated resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad \
--input_validation resources_test/task_cyto_batch_integration/starter_file/validation.h5ad \
--samples_to_compare "Tube1_Batch1_WT;Tube1_Batch2_WT" \
--output_state state.yaml \
--publish_dir "$publish_dir"
@@ -7,19 +7,19 @@ __merge__: ../../api/comp_metric.yaml

# A unique identifier for your component (required).
# Can contain only lowercase letters or underscores.
name: emd_per_samples
name: emd

# Metadata for your component
info:
metrics:
# A unique identifier for your metric (required).
# Can contain only lowercase letters or underscores.
- name: emd_per_samples
- name: emd
# A relatively short label, used when rendering visualisations (required)
label: EMD Per Samples
label: EMD
# A one sentence summary of how this metric works (required). Used when
# rendering summary tables.
summary: "Earth Mover Distance to compute differences in marker expression across two samples."
summary: "Earth Mover Distance to compute differences in the distributions of marker expression."
# A multi-line description of how this component works (required). Used
# when rendering reference documentation.
description: |
@@ -39,37 +39,32 @@ info:
max: .inf
# Whether a higher value represents a 'better' solution (required)
maximize: false
# Keep this just for future metrics' reference, if required.
# Note: this is needed if we have a component-specific argument with no default.
# When running the actual command, either separate the sample names with ";",
# e.g. Tube1_Batch1_WT;Tube1_Batch2_WT, or repeat the flag twice, e.g.
# --samples_to_compare Tube1_Batch1_WT --samples_to_compare Tube1_Batch2_WT
test_setup:
starter_file:
samples_to_compare:
- Mouse1_Batch1_WT
- Mouse1_Batch2_WT
# test_setup:
# starter_file:
# samples_to_compare:
# - Tube1_Batch1_WT
# - Tube1_Batch2_WT
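The commented-out note above describes two equivalent ways of passing matching samples on the command line. As an illustration only (`parse_samples_to_compare` is a hypothetical helper, not part of this PR or of viash), both forms can be normalised to the same list of sample names:

```python
def parse_samples_to_compare(values):
    """Flatten raw argument values, splitting any ";"-separated entries.

    Accepts either ["Tube1_Batch1_WT;Tube1_Batch2_WT"] (single flag, ";"-separated)
    or ["Tube1_Batch1_WT", "Tube1_Batch2_WT"] (flag repeated twice).
    """
    samples = []
    for value in values:
        # split on ";" and drop empty fragments from stray separators
        samples.extend(s for s in value.split(";") if s)
    return samples


# Both invocation styles yield the same list:
print(parse_samples_to_compare(["Tube1_Batch1_WT;Tube1_Batch2_WT"]))
print(parse_samples_to_compare(["Tube1_Batch1_WT", "Tube1_Batch2_WT"]))
```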

# Component-specific parameters (optional)
arguments:
- name: "--samples_to_compare"
type: "string"
description: 2 samples to compare.
required: true
multiple: true
- name: "--layer"
type: "string"
default: "integrated"
description: The layer in input anndata containing the marker expression
# Keep for now for future metrics' reference if required
# arguments:
# - name: "--samples_to_compare"
# type: "string"
# description: 2 samples to compare.
# required: true
# multiple: true

# Resources required to run the component
resources:
# The script of your component (required)
- type: python_script
path: script.py
# Additional resources your script needs (optional)
# - type: file
# path: weights.pt

engines:
# Specifications for the Docker image for this component.
@@ -79,7 +74,7 @@ engines:
# https://viash.io/reference/config/engines/docker/#setup .
setup:
- type: python
packages: [anndata]
packages: [anndata, numpy, pandas]
github: [TarikExner/CytoNormPy]

runners:
161 changes: 161 additions & 0 deletions src/metrics/emd/script.py
@@ -0,0 +1,161 @@
from collections import defaultdict

import anndata as ad
import cytonormpy as cnp
import numpy as np
import pandas as pd

## VIASH START
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
par = {
    "input_integrated": "resources_test/task_cyto_batch_integration/starter_file/integrated.h5ad",
    "input_unintegrated": "resources_test/task_cyto_batch_integration/starter_file/unintegrated.h5ad",
    "input_validation": "resources_test/task_cyto_batch_integration/starter_file/validation.h5ad",
    "output": "output.h5ad",
}
meta = {"name": "emd"}
## VIASH END

print("Reading input files", flush=True)

input_integrated = ad.read_h5ad(par["input_integrated"])
input_unintegrated = ad.read_h5ad(par["input_unintegrated"])
input_validation = ad.read_h5ad(par["input_validation"])

# add all obs columns in the unintegrated data to integrated data
# loc should order the obs based on obs_names
input_integrated.obs = input_unintegrated.obs.loc[input_integrated.obs_names]

# TODO uncomment me if you want to have some samples in validation but not in integrated
# input_validation = input_integrated[
# input_integrated.obs["sample"] == "Tube1_Batch2_WT"
# ].copy()
# input_validation.layers["preprocessed"] = input_validation.layers["integrated"]
# del input_validation.layers["integrated"]
# input_integrated = input_integrated[
# input_integrated.obs["sample"].isin(
# ["Tube1_Batch1_WT", "Tube2_Batch1_KO", "Tube2_Batch2_KO"]
# )
# ].copy()

markers_to_assess = input_unintegrated.var[
    input_unintegrated.var["to_correct"]
].index.to_numpy()

print("Extracting samples to compute the metric for", flush=True)

# Perhaps overengineering this, but the idea is to get all the sample names from
# the integrated data, and the donor names from the combination of
# input_validation and input_unintegrated. Why? Depending on how the input for
# an algorithm is structured, we may include matching samples from the same
# donor (which will then appear in input_unintegrated) or not (which will then
# only appear in input_validation). We then get all samples belonging to that
# donor, double check the batch id, and compute the EMD across the matching samples.

# get the full donor-sample map from the data.
# for some dataset "schemes", matching samples from the same donor may not be
# given as input to the batch correction algorithm, and thus will only exist
# in the validation data. We need this map for each donor processed by the
# batch correction algorithm.
# The assumption here is that for a given donor, at least 1 sample was
# processed by the batch correction algorithm, i.e. is in uncorrected_censored
# (or uncorrected).
sample_donor_map_arr = np.concatenate(
    (
        np.unique(
            input_validation.obs[["sample", "donor"]].to_numpy().astype(str), axis=0
        ),
        np.unique(
            input_integrated.obs[["sample", "donor"]].to_numpy().astype(str), axis=0
        ),
    )
)
# keep just the donors whose samples were processed by the batch correction
# algorithm. Why? We only need to compute the metric score for the matching
# samples of donors that were processed by the batch correction algorithm.
# TODO could use a list instead of a set, because validation and unintegrated
# should not share common samples? Keep set for now.
sample_donor_dict = defaultdict(set)

for sample_donor_pair in sample_donor_map_arr:
    sample = sample_donor_pair[0]
    donor = sample_donor_pair[1]
    sample_donor_dict[donor].add(sample)

emd_integrated_per_donor = []
samples_for_emd = []

for donor, samples in sample_donor_dict.items():

    # test 1 donor for now.
    # donor = list(sample_donor_dict.keys())[0]
    # samples = sample_donor_dict[donor]

    # the data for some samples may only be in the validation data because they
    # were not included as input for the batch correction algorithm.
    input_integrated_donor = input_integrated[
        input_integrated.obs["sample"].isin(samples)
    ]
    input_validation_donor = input_validation[
        input_validation.obs["sample"].isin(samples)
    ]

    # keep just the markers we need to assess
    input_integrated_donor = input_integrated_donor[:, markers_to_assess]
    input_validation_donor = input_validation_donor[:, markers_to_assess]

    input_for_emd = ad.concat([input_integrated_donor, input_validation_donor])
    input_for_emd.layers["integrated"] = np.concatenate(
        (
            input_integrated_donor.layers["integrated"],
            input_validation_donor.layers["preprocessed"],
        )
    )

    # sanity check: we need at least 2 batches
    if len(np.unique(input_for_emd.obs["batch"])) < 2:
        raise ValueError(
            "The samples used to calculate EMD for donor %s belong to only 1 batch"
            % donor
        )

    print("Computing EMD for donor %s" % donor, flush=True)

    # have to copy the "sample" column to "file_name" for emd_from_anndata to
    # work. Otherwise _calculate_emd_per_frame used in cytonormpy will error,
    # because it hardcodes the column name file_name and uses it in an assert.
    # See https://github.com/TarikExner/CytoNormPy/blob/main/cytonormpy/_evaluation/_emd_utils.py#L173
    input_for_emd.obs["file_name"] = input_for_emd.obs["sample"]

    emd_integrated = cnp.emd_from_anndata(
        adata=input_for_emd,
        file_list=list(samples),
        channels=markers_to_assess,
        layer="integrated",
        sample_identifier_column="file_name",
    ).reset_index()

    del emd_integrated["label"]

    emd_integrated_per_donor.append(emd_integrated)
    samples_for_emd.append(";".join(samples))

emd_integrated_per_donor = pd.concat(emd_integrated_per_donor, ignore_index=True)

# take the max over all markers and donors. Select the marker columns first so
# the non-numeric file_name column is excluded from the reduction.
max_emd = emd_integrated_per_donor[markers_to_assess].to_numpy().max()
# mean_emd = emd_integrated_per_donor[markers_to_assess].to_numpy().mean()

print("Assembling output AnnData", flush=True)
output = ad.AnnData(
    uns={
        "dataset_id": input_integrated.uns["dataset_id"],
        "method_id": input_integrated.uns["method_id"],
        "metric_ids": ["max_emd"],
        "metric_values": [max_emd],
        "sample_ids": samples_for_emd,
    }
)

print("Writing output AnnData to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
68 changes: 0 additions & 68 deletions src/metrics/emd_per_samples/script.py

This file was deleted.

7 changes: 1 addition & 6 deletions src/workflows/run_benchmark/config.vsh.yaml
@@ -18,11 +18,6 @@ argument_groups:
type: file
direction: input
required: true
- name: "--samples_to_compare"
type: "string"
description: 2 samples to compare. Separate the sample names by comma
required: true
multiple: true
- name: Outputs
arguments:
- name: "--output_scores"
@@ -73,7 +68,7 @@ dependencies:
- name: control_methods/shuffle_integration_by_cell_type
- name: methods/harmonypy
- name: methods/limma_remove_batch_effect
- name: metrics/emd_per_samples
- name: metrics/emd

runners:
- type: nextflow
3 changes: 1 addition & 2 deletions src/workflows/run_benchmark/main.nf
@@ -16,7 +16,7 @@ methods = [

// construct list of metrics
metrics = [
emd_per_samples
emd
]

workflow run_wf {
@@ -128,7 +128,6 @@ workflow run_wf {
input_validation: "input_validation",
input_unintegrated: "input_unintegrated",
input_integrated: "method_output",
samples_to_compare: "samples_to_compare"
],
// use 'toState' to publish that component's outputs to the overall state
toState: { id, output, state, comp ->