Commit

add metrics
rcannood committed Sep 21, 2024
1 parent 16ddbbe commit 245cb0c
Showing 24 changed files with 1,146 additions and 1 deletion.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@
/output
trace-*
.ipynb_checkpoints
__pycache__
2 changes: 1 addition & 1 deletion common
52 changes: 52 additions & 0 deletions src/metrics/asw_batch/config.vsh.yaml
@@ -0,0 +1,52 @@
__merge__: /src/api/comp_metric.yaml
name: asw_batch
info:
metrics:
- name: asw_batch
label: ASW batch
summary: Modified average silhouette width (ASW) of batch
# TODO: transform into more readable markdown with proper formulae formatting
description: |
We consider the absolute silhouette width, s(i), computed on batch labels for
each cell i. Here, 0 indicates that batches are well mixed, and any deviation
from 0 indicates a batch effect:
s_batch(i) = |s(i)|.
To ensure that higher scores indicate better batch mixing, these scores are scaled by
subtracting them from 1. As we expect batches to integrate within cell identity
clusters, we compute the batchASW_j score for each cell label j separately,
using the equation:
batchASW_j = (1 / |C_j|) * sum_{i in C_j} (1 - s_batch(i)),
where C_j is the set of cells with the cell label j and |C_j| denotes the number of cells
in that set.
To obtain the final batchASW score, the label-specific batchASW_j scores are averaged:
batchASW = (1 / |M|) * sum_{j in M} batchASW_j,
where M is the set of unique cell labels.
references:
doi: 10.1038/s41592-021-01336-8
links:
homepage: https://scib.readthedocs.io/en/latest/
documentation: https://scib.readthedocs.io/en/latest/api/scib.metrics.silhouette_batch.html
repository: https://github.com/theislab/scib
min: 0
max: 1
maximize: true
resources:
- type: python_script
path: script.py
- path: /src/utils/read_anndata_partial.py
engines:
- type: docker
image: openproblems/base_python:1.0.0
setup:
- type: python
pypi:
- scib==1.1.5
runners:
- type: executable
- type: nextflow
directives:
label: [midtime, midmem, lowcpu]
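As a reading aid for the formula above: per cell label, compute per-cell silhouette widths on batch labels, take 1 − |s(i)|, and average; then average over labels. A minimal, hypothetical sketch of that idea using scikit-learn (an illustration, not the scib implementation used by the script below; emb, batches and labels are assumed inputs):

import numpy as np
from sklearn.metrics import silhouette_samples

def batch_asw_sketch(emb, batches, labels):
    # emb: (n_cells, n_dims) embedding; batches, labels: 1D arrays per cell
    per_label = []
    for lab in np.unique(labels):
        mask = labels == lab
        # silhouettes on batch labels need at least two batches within the cluster
        if len(np.unique(batches[mask])) < 2:
            continue
        s = silhouette_samples(emb[mask], batches[mask])  # s(i) on batch labels
        per_label.append(np.mean(1 - np.abs(s)))          # batchASW_j
    return float(np.mean(per_label))                      # average over labels M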
44 changes: 44 additions & 0 deletions src/metrics/asw_batch/script.py
@@ -0,0 +1,44 @@
import sys
import anndata as ad
from scib.metrics import silhouette_batch

## VIASH START
par = {
'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad',
'output': 'output.h5ad',
}
meta = {
'name': 'foo',
}
## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns')
adata.obs = read_anndata(par['input_solution'], obs='obs').obs
adata.uns |= read_anndata(par['input_solution'], uns='uns').uns

print('compute score', flush=True)
score = silhouette_batch(
adata,
batch_key='batch',
label_key='label',
embed='X_emb',
)

print('Create output AnnData object', flush=True)
output = ad.AnnData(
uns={
'dataset_id': adata.uns['dataset_id'],
'normalization_id': adata.uns['normalization_id'],
'method_id': adata.uns['method_id'],
'metric_ids': [ meta['name'] ],
'metric_values': [ score ]
}
)

print('Write data to file', flush=True)
output.write_h5ad(par['output'], compression='gzip')
40 changes: 40 additions & 0 deletions src/metrics/asw_label/config.vsh.yaml
@@ -0,0 +1,40 @@
__merge__: /src/api/comp_metric.yaml
name: asw_label
info:
metrics:
- name: asw_label
label: ASW Label
summary: Average silhouette of cell identity labels (cell types)
# TODO: transform into more readable markdown with proper formulae formatting
description: |
For the bio-conservation score, the ASW was computed on cell identity labels and
scaled to a value between 0 and 1 using the equation:
celltypeASW = (ASW_C + 1) / 2,
where C denotes the set of all cell identity labels.
For information about the batch silhouette score, see the asw_batch metric.
references:
doi: 10.1038/s41592-021-01336-8
links:
homepage: https://scib.readthedocs.io/en/latest/
documentation: https://scib.readthedocs.io/en/latest/api/scib.metrics.silhouette_batch.html
repository: https://github.com/theislab/scib
min: 0
max: 1
maximize: true
resources:
- type: python_script
path: script.py
- path: /src/utils/read_anndata_partial.py
engines:
- type: docker
image: openproblems/base_python:1.0.0
setup:
- type: python
pypi:
- scib==1.1.5
runners:
- type: executable
- type: nextflow
directives:
label: [midtime, midmem, lowcpu]
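The scaling in the formula above maps the usual silhouette range onto [0, 1]. A minimal, hypothetical sketch using scikit-learn (an illustration, not the scib call used in the script below; emb and cell_types are assumed inputs):

from sklearn.metrics import silhouette_score

asw = silhouette_score(emb, cell_types)  # ASW_C in [-1, 1], on cell identity labels
celltype_asw = (asw + 1) / 2             # scaled to [0, 1]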
44 changes: 44 additions & 0 deletions src/metrics/asw_label/script.py
@@ -0,0 +1,44 @@
import sys
import anndata as ad
from scib.metrics import silhouette

## VIASH START
par = {
'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad',
'output': 'output.h5ad',
}

meta = {
'name': 'foo',
}
## VIASH END

sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata = read_anndata(par['input_integrated'], obs='obs', obsm='obsm', uns='uns')
adata.obs = read_anndata(par['input_solution'], obs='obs').obs
adata.uns |= read_anndata(par['input_solution'], uns='uns').uns

print('compute score', flush=True)
score = silhouette(
adata,
label_key='label',
embed='X_emb'
)

print("Create output AnnData object", flush=True)
output = ad.AnnData(
uns={
"dataset_id": adata.uns['dataset_id'],
'normalization_id': adata.uns['normalization_id'],
"method_id": adata.uns['method_id'],
"metric_ids": [meta['name']],
"metric_values": [score]
}
)

print("Write data to file", flush=True)
output.write_h5ad(par["output"], compression="gzip")
50 changes: 50 additions & 0 deletions src/metrics/cell_cycle_conservation/config.vsh.yaml
@@ -0,0 +1,50 @@
__merge__: /src/api/comp_metric.yaml
name: cell_cycle_conservation
info:
metrics:
- name: cell_cycle_conservation
label: Cell Cycle Conservation
summary: Cell cycle conservation score based on principal component regression
on cell cycle gene scores
# TODO: transform into more readable markdown with proper formulae formatting
description: |
The cell-cycle conservation score evaluates how well the cell-cycle effect can be
captured before and after integration. We computed cell-cycle scores using Scanpy's
score_genes_cell_cycle function with a reference gene set from Tirosh et al. for the
respective cell-cycle phases. We used the same set of cell-cycle genes for mouse and
human data (using capitalization to convert between the gene symbols). We then computed
the variance contribution of the resulting S and G2/M phase scores using principal
component regression, which was performed for each batch separately. The differences
in variance before (Var_before) and after (Var_after) integration were aggregated into
a final score between 0 and 1, using the equation:
CC_conservation = 1 - |Var_after - Var_before| / Var_before.
In this equation, values close to 0 indicate low conservation and 1 indicates complete
conservation of the variance explained by the cell cycle. In other words, the variance
remains unchanged within each batch for complete conservation, while any deviation from
the pre-integration variance contribution reduces the score.
references:
doi: 10.1038/s41592-021-01336-8
links:
homepage: https://scib.readthedocs.io/en/latest/
documentation: https://scib.readthedocs.io/en/latest/api/scib.metrics.silhouette_batch.html
repository: https://github.com/theislab/scib
min: 0
max: 1
maximize: true
resources:
- type: python_script
path: script.py
- path: /src/utils/read_anndata_partial.py
engines:
- type: docker
image: openproblems/base_python:1.0.0
setup:
- type: python
pypi:
- scib==1.1.5
runners:
- type: executable
- type: nextflow
directives:
label: [midtime, midmem, lowcpu]
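To make the principal component regression step above concrete, here is a hedged sketch of how the variance contribution of a covariate (e.g. the S or G2/M score) can be estimated. This is an illustration only, not scib's exact implementation; X and covariate are assumed inputs.

import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

def pcr_variance_contribution(X, covariate, n_comps=50):
    # Fraction of the variance of X explained by the covariate, estimated by
    # regressing each principal component on the covariate and weighting the
    # resulting R^2 values by the explained variance ratio of each PC.
    n_comps = min(n_comps, min(X.shape))
    pca = PCA(n_components=n_comps)
    pcs = pca.fit_transform(X)
    var_ratio = pca.explained_variance_ratio_
    cov = np.asarray(covariate).reshape(-1, 1)
    r2 = np.array([
        LinearRegression().fit(cov, pcs[:, k]).score(cov, pcs[:, k])
        for k in range(pcs.shape[1])
    ])
    return float(np.sum(r2 * var_ratio) / np.sum(var_ratio))

# Per batch, the before/after variance contributions are then combined as
# described above: cc_conservation = 1 - abs(var_after - var_before) / var_before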
69 changes: 69 additions & 0 deletions src/metrics/cell_cycle_conservation/script.py
@@ -0,0 +1,69 @@
import sys
import anndata as ad
from scib.metrics import cell_cycle
import numpy as np

## VIASH START
par = {
'input_integrated': 'resources_test/task_batch_integration/cxg_mouse_pancreas_atlas/integrated_full.h5ad',
'output': 'output.h5ad'
}

meta = {
'name': 'foo'
}
## VIASH END
sys.path.append(meta["resources_dir"])
from read_anndata_partial import read_anndata


print('Read input', flush=True)
adata_solution = read_anndata(
par['input_solution'],
X='layers/normalized',
obs='obs',
var='var',
uns='uns'
)
adata_integrated = read_anndata(
par['input_integrated'],
obs='obs',
obsm='obsm',
uns='uns'
)

print('Use gene symbols for features', flush=True)
adata_solution.var_names = adata_solution.var['feature_name']

translator = {
"homo_sapiens": "human",
"mus_musculus": "mouse",
}

print('Compute score', flush=True)
if adata_solution.uns['dataset_organism'] not in translator:
score = np.nan
else:
organism = translator[adata_solution.uns['dataset_organism']]
score = cell_cycle(
adata_solution,
adata_integrated,
batch_key='batch',
embed='X_emb',
organism=organism,
)

print('Create output AnnData object', flush=True)
output = ad.AnnData(
uns={
'dataset_id': adata_solution.uns['dataset_id'],
'normalization_id': adata_solution.uns['normalization_id'],
'method_id': adata_integrated.uns['method_id'],
'metric_ids': [ meta['name'] ],
'metric_values': [ score ]
}
)


print('Write data to file', flush=True)
output.write_h5ad(par['output'], compression='gzip')
66 changes: 66 additions & 0 deletions src/metrics/clustering_overlap/config.vsh.yaml
@@ -0,0 +1,66 @@
__merge__: /src/api/comp_metric.yaml
name: clustering_overlap
info:
metrics:
- name: ari
label: ARI
summary: Adjusted Rand Index compares clustering overlap, correcting for random
labels and considering correct overlaps and disagreements.
description: |
The Adjusted Rand Index (ARI) compares the overlap of two clusterings;
it accounts for both correct clustering overlaps and correct
disagreements between the two clusterings.
We compared the cell-type labels with the NMI-optimized
Louvain clustering computed on the integrated dataset.
The adjustment of the Rand index corrects for randomly correct labels.
An ARI of 0 or 1 corresponds to random labeling or a perfect match,
respectively.
references:
doi:
- 10.1038/s41592-021-01336-8
- 10.1007/bf01908075
links:
homepage: https://scib.readthedocs.io/en/latest/
documentation: https://scib.readthedocs.io/en/latest/api/scib.metrics.silhouette_batch.html
repository: https://github.com/theislab/scib
min: 0
max: 1
maximize: true
- name: nmi
label: NMI
summary: NMI compares overlap by scaling using mean entropy terms and optimizing
Louvain clustering to obtain the best match between clusters and labels.
description: |
Normalized Mutual Information (NMI) compares the overlap of two clusterings.
We used NMI to compare the cell-type labels with Louvain clusters computed on
the integrated dataset. The overlap was scaled using the mean of the entropy terms
for cell-type and cluster labels. Thus, NMI scores of 0 or 1 correspond to uncorrelated
clustering or a perfect match, respectively. We performed optimized Louvain clustering
for this metric to obtain the best match between clusters and labels.
references:
doi:
- 10.1145/2808797.2809344
- 10.1038/s41592-021-01336-8
links:
homepage: https://scib.readthedocs.io/en/latest/
documentation: https://scib.readthedocs.io/en/latest/api/scib.metrics.silhouette_batch.html
repository: https://github.com/theislab/scib
min: 0
max: 1
maximize: true
resources:
- type: python_script
path: script.py
- path: /src/utils/read_anndata_partial.py
engines:
- type: docker
image: openproblems/base_python:1.0.0
setup:
- type: python
pypi:
- scib==1.1.5
runners:
- type: executable
- type: nextflow
directives:
label: [midtime, midmem, lowcpu]
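For reference, both scores described above have direct counterparts in scikit-learn. A minimal, hypothetical sketch (the component itself uses scib, which additionally optimizes the Louvain clustering resolution; cell_types and clusters are assumed inputs):

from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score

ari = adjusted_rand_score(cell_types, clusters)
# NMI scaled by the arithmetic mean of the entropy terms, as described above
nmi = normalized_mutual_info_score(cell_types, clusters, average_method="arithmetic")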