Make feature branch up to date

nadeemlab · Mar 25, 2024 · 15855bb · 15855bb
2 parents 9d7e218 + 3c58b19
commit 15855bb
Show file tree

Hide file tree

Showing 31 changed files with 1,293 additions and 71 deletions.
diff --git a/analysis_replication/README.md b/analysis_replication/README.md
@@ -1,7 +1,7 @@
 
 # Reproducible analyses
 
-The scripts in this directory reproduce the analyses of the curated datasets, in the order they are mentioned in the article.
+The scripts in this directory reproduce the analyses of the curated datasets, in almost exactly the same order that they are mentioned in the article.
 
 These scripts were written as a record of usage of the dashboard web application, which provides the same results.
 
@@ -22,3 +22,12 @@ substituting the argument with the address of your local API server. (See *Setti
 - These scripts just call the web API, and so they do not require Python package `spatialprofilingtoolbox`.
 - You can alternatively store the API host in `api_host.txt` and omit the command-line argument above.
 - The run result is here in [results.txt](results.txt).
+
+# Figure generation
+
+One figure is generated programmatically from published source TIFF files.
+To run the figure generation script, alter the command below to reference your own database configuration file and path to unzipped Moldoveanu et al dataset.
+
+```bash
+python retrieve_example_plot.py dataset_directory/ ~/.spt_db.config
+```
diff --git a/analysis_replication/accessors.py b/analysis_replication/accessors.py
@@ -2,10 +2,8 @@
 from typing import cast
 import re
 from itertools import chain
-from urllib.request import urlopen
 from urllib.parse import urlencode
-from urllib.error import HTTPError
-import json
+from requests import get as get_request  # type: ignore
 from os.path import exists
 from time import sleep
 
@@ -17,6 +15,7 @@
 from numpy import mean
 from numpy import log
 from scipy.stats import ttest_ind  # type: ignore
+from sklearn.metrics import auc  # type:ignore
 
 
 def get_default_host(given: str | None) -> str | None:
@@ -219,12 +218,11 @@ def _retrieve(self, endpoint, query):
         base = f'{self._get_base()}'
         url = '/'.join([base, endpoint, '?' + query])
         try:
-            with urlopen(url) as response:
-                content = response.read()
-        except HTTPError as exception:
+            content = get_request(url)
+        except Exception as exception:
             print(url)
             raise exception
-        return json.loads(content), url
+        return content.json(), url
 
     def _phenotype_criteria(self, name):
         if isinstance(name, dict):
@@ -341,15 +339,35 @@ def handle_expected_actual(expected: float, actual: float | None):
     print(Colors.bold_green + padded + Colors.reset, end='')
 
 
+def compute_auc(list1: list[float], list2: list[float]) -> float:
+    pairs = [(value, 0) for value in list1] + [(value, 1) for value in list2]
+    pairs.sort(key=lambda pair: pair[0])
+    total_labelled = sum([pair[1] for pair in pairs])
+    total_unlabelled = len(pairs) - total_labelled
+    graph_points = [(0.0, 1.0)]
+    true_positives = 0
+    true_negatives = total_unlabelled
+    for _, label in pairs:
+        if label == 1:
+            true_positives = true_positives + 1
+        else:
+            true_negatives = true_negatives - 1
+        graph_points.append((true_positives / total_labelled, true_negatives / total_unlabelled))
+    _auc = auc([p[0] for p in graph_points], [p[1] for p in graph_points])
+    _auc = max(_auc, 1 - _auc)
+    return _auc
+
+
 def univariate_pair_compare(
-        list1,
-        list2,
-        expected_fold=None,
-        do_log_fold: bool = False,
-        show_pvalue=False,
+    list1,
+    list2,
+    expected_fold=None,
+    do_log_fold: bool = False,
+    show_pvalue=False,
+    show_auc=False,
 ):
-    list1 = list(filter(lambda element: not isnan(element), list1.values))
-    list2 = list(filter(lambda element: not isnan(element), list2.values))
+    list1 = list(filter(lambda element: not isnan(element) and not element==inf, list1.values))
+    list2 = list(filter(lambda element: not isnan(element) and not element==inf, list2.values))
 
     mean1 = float(mean(list1))
     mean2 = float(mean(list2))
@@ -368,16 +386,33 @@ def univariate_pair_compare(
 
     if show_pvalue:
         if do_log_fold:
-            result = ttest_ind(_list1, _list2)
+            result = ttest_ind(_list1, _list2, equal_var=False)
             print(
                 '  p-value (after log): ' + Colors.blue + str(result.pvalue) + Colors.reset, end=''
             )
         else:
-            result = ttest_ind(list1, list2)
+            result = ttest_ind(list1, list2, equal_var=False)
             print('  p-value: ' + Colors.blue + str(result.pvalue) + Colors.reset, end='')
 
+    if show_auc:
+        _auc = compute_auc(list1, list2)
+        print('  AUC: ' + Colors.blue + str(_auc) + Colors.reset, end='')
+
     print('')
 
 
-def print_comparison() -> None:
-    pass
+def get_fractions(df, column_numerator, column_denominator, cohort1, cohort2, omit_zeros=True):
+    fractions = df[column_numerator] / df[column_denominator]
+    if omit_zeros:
+        mask = ~ ( (df[column_numerator] == 0) | (df[column_denominator] == 0) )
+        total1 = sum((df['cohort'] == cohort1))
+        omit1 = total1 - sum((df['cohort'] == cohort1) & mask)
+        total2 = sum((df['cohort'] == cohort2))
+        omit2 = total2 - sum((df['cohort'] == cohort2) & mask)
+        if omit1 !=0 or omit2 !=0:
+            print(f'(Omitting {omit1}/{total1} from {cohort1} and {omit2}/{total2} from {cohort2}.)')
+    else:
+        mask = True
+    fractions1 = fractions[(df['cohort'] == cohort1) & mask]
+    fractions2 = fractions[(df['cohort'] == cohort2) & mask]
+    return fractions1, fractions2
diff --git a/analysis_replication/api_host.txt b/analysis_replication/api_host.txt
@@ -0,0 +1 @@
+http://oncopathtk.org/api
diff --git a/analysis_replication/breast_imc.py b/analysis_replication/breast_imc.py
@@ -24,7 +24,7 @@ def test(host):
     values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT7+ CK+']
     # handle_expected_actual(1.6216, mean2 / mean1)
     # # handle_expected_actual(1.69, mean2 / mean1)
-    compare(values1, values2, expected_fold=1.6216, show_pvalue=True)
+    compare(values1, values2, expected_fold=1.6216, show_pvalue=True, show_auc=True)
 
     df = access.proximity([KRT['14'], KRT['5']])
     values2 = df[df['cohort'] == '2']['proximity, KRT14+ CK+ and KRT5+ CK+']

diff --git a/analysis_replication/luad_imc.py b/analysis_replication/luad_imc.py
@@ -14,17 +14,17 @@ def test(host):
     df = access.spatial_autocorrelation('B cell')
     values1 = df[df['cohort'] == '1']['spatial autocorrelation, B cell']
     values2 = df[df['cohort'] == '2']['spatial autocorrelation, B cell']
-    compare(values1, values2, expected_fold=1.571, show_pvalue=True)
+    compare(values1, values2, expected_fold=1.513, show_pvalue=True)
 
     df = access.neighborhood_enrichment(['CD163+ macrophage', 'Regulatory T cell'])
     values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD163+ macrophage and Regulatory T cell']
     values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD163+ macrophage and Regulatory T cell']
-    compare(values1, values2, expected_fold=797.46, do_log_fold=True, show_pvalue=True)
+    compare(values1, values2, expected_fold=467.1, do_log_fold=True, show_pvalue=True)
 
     df = access.neighborhood_enrichment(['CD163+ macrophage', 'Endothelial cell'])
     values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD163+ macrophage and Endothelial cell']
     values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD163+ macrophage and Endothelial cell']
-    compare(values1, values2, expected_fold=9.858, do_log_fold=True, show_pvalue=True)
+    compare(values1, values2, expected_fold=5.336, do_log_fold=True, show_pvalue=True)
 
     klrd1 = {'positive_markers': ['KLRD1'], 'negative_markers': []}
     cd14 = {'positive_markers': ['CD14'], 'negative_markers': []}

diff --git a/analysis_replication/melanoma_ici.py b/analysis_replication/melanoma_ici.py
@@ -1,8 +1,6 @@
 """Data analysis script for one dataset."""
 import sys
 
-from numpy import mean
-
 from accessors import DataAccessor
 from accessors import get_default_host
 from accessors import univariate_pair_compare as compare
@@ -11,25 +9,104 @@ def test(host):
     study = 'Melanoma CyTOF ICI'
     access = DataAccessor(study, host=host)
 
-    antigen_experienced_cytotoxic = {'positive_markers': ['CD8A', 'CD3', 'CD45RO'], 'negative_markers': []}
+    name1 = 'Cytotoxic T cell antigen-experienced'
+    name2 = 'Melanoma'
+
+    print('Not especially statistically significant, but matches prior finding:')
+    df = access.proximity([name1, name2])
+    prox = f'proximity, {name1} and {name2}'
+    print(f'Proximity of {name1} to {name2}')
+    values1 = df[df['cohort'] == '1'][prox]
+    values2 = df[df['cohort'] == '2'][prox]
+    compare(values1, values2, expected_fold=1.66, show_pvalue=True, show_auc=True)
+
+    df = access.proximity([name2, name1])
+    prox = f'proximity, {name2} and {name1}'
+    print(f'Proximity of {name2} to {name1}')
+    values1 = df[df['cohort'] == '1'][prox]
+    values2 = df[df['cohort'] == '2'][prox]
+    compare(values1, values2, expected_fold=1.34, show_pvalue=True, show_auc=True)
+
+    df = access.neighborhood_enrichment([name1, name2])
+    ne = f'neighborhood enrichment, {name1} and {name2}'
+    print(f'Neighborhood enrichment for {name1} w.r.t. {name2}.')
+    values1 = df[df['cohort'] == '1'][ne]
+    values2 = df[df['cohort'] == '2'][ne]
+    compare(values1, values2, expected_fold=0.75, show_pvalue=True)
+
+    df = access.neighborhood_enrichment([name2, name1])
+    ne = f'neighborhood enrichment, {name2} and {name1}'
+    print(f'Neighborhood enrichment for {name2} w.r.t. {name1}.')
+    values1 = df[df['cohort'] == '1'][ne]
+    values2 = df[df['cohort'] == '2'][ne]
+    compare(values1, values2, expected_fold=0.405, show_pvalue=True)
+
+    # Two phenotypes spatial
+    print('\nResults involving spatial metrics for 2 phenotypes (shown in both orders for reference)')
+    for p1, p2, fold in zip(
+        ['Cytotoxic T cell antigen-experienced', 'T regulatory cells', 'Cytotoxic T cell antigen-experienced', 'Naive cytotoxic T cell'],
+        ['T regulatory cells', 'Cytotoxic T cell antigen-experienced', 'Naive cytotoxic T cell', 'Cytotoxic T cell antigen-experienced'],
+        [0.285, 0.369, 0.299, 0.5],
+    ):
+        df = access.neighborhood_enrichment([p1, p2])
+        ne = f'neighborhood enrichment, {p1} and {p2}'
+        values1 = df[df['cohort'] == '1'][ne].dropna()
+        values2 = df[df['cohort'] == '2'][ne].dropna()
+        print(f'Neighborhood enrichment for {p1} and {p2}')
+        compare(values1, values2, expected_fold=fold, show_pvalue=True, do_log_fold=True, show_auc=True)
+
+    for p1, p2, fold in zip(
+        ['Naive cytotoxic T cell', 'T regulatory cells', 'Naive cytotoxic T cell', 'T helper cell antigen-experienced'],
+        ['T regulatory cells', 'Naive cytotoxic T cell', 'T helper cell antigen-experienced', 'Naive cytotoxic T cell'],
+        [0.683, 0.417, 0.899, 0.400],
+    ):
+        df = access.proximity([p1, p2])
+        prox = f'proximity, {p1} and {p2}'
+        print(f'Proximity of {p1} to {p2}')
+        values1 = df[df['cohort'] == '1'][prox]
+        values2 = df[df['cohort'] == '2'][prox]
+        compare(values1, values2, expected_fold=fold, show_pvalue=True, show_auc=True)
+
+    print('\nSingle channel or phenotype results')
+
+    def p(channel):
+        return {'positive_markers':[channel], 'negative_markers':[]}
+
+    CD3CD45RO = {'positive_markers': ['CD3', 'CD45RO'], 'negative_markers': []}
+    name = 'CD3+ CD45RO+'
+    fold = 0.000878
+    df = access.neighborhood_enrichment([CD3CD45RO, CD3CD45RO])
+    ne = f'neighborhood enrichment, {name} and {name}'
+    values1 = df[df['cohort'] == '1'][ne].dropna()
+    values2 = df[df['cohort'] == '2'][ne].dropna()
+    print(f'Neighborhood enrichment for {name} and itself')
+    compare(values1, values2, expected_fold=fold, show_pvalue=True, do_log_fold=True, show_auc=True)
+
+    ### Channel
+    for channel, fold in zip(['ICOS'], [0.791]):
+        df = access.proximity([p(channel), p(channel)])
+        prox = f'proximity, {channel}+ and {channel}+'
+        print(f'Proximity of {channel} to {channel}')
+        values1 = df[df['cohort'] == '1'][prox]
+        values2 = df[df['cohort'] == '2'][prox]
+        compare(values1, values2, expected_fold=fold, show_pvalue=True, show_auc=True)
 
-    # The average value of the neighborhood enrichment score for phenotype(s) CD3+ CD45RO+ CD8A+ and
-    # Melanoma is 1.39 times higher in cohort 1 than in cohort 2.
-    df = access.neighborhood_enrichment([antigen_experienced_cytotoxic, 'Melanoma'])
-    values1 = df[df['cohort'] == '1']['neighborhood enrichment, CD8A+ CD3+ CD45RO+ and Melanoma']
-    values2 = df[df['cohort'] == '2']['neighborhood enrichment, CD8A+ CD3+ CD45RO+ and Melanoma']
-    # handle_expected_actual(1.234, mean1 / mean2)
-    # # handle_expected_actual(1.39, mean1 / mean2)
-    compare(values2, values1, expected_fold=1.234, do_log_fold=True)
+    ### Phenotype
+    for phenotype, fold in zip(['B cells'], [0.0826]):
+        df = access.spatial_autocorrelation(phenotype)
+        autocorr = f'spatial autocorrelation, {phenotype}'
+        values1 = df[df['cohort'] == '1'][autocorr]
+        values2 = df[df['cohort'] == '2'][autocorr]
+        print(f'Spatial autocorrelation for {phenotype}')
+        compare(values1, values2, expected_fold=fold, show_pvalue=True, show_auc=True)
 
-    # On average, the fraction of cells that are CD8A+ CD3+ CD45RO+ and MKI67+ is 1.41 times higher
-    # in cohort 1 than in cohort 2.
-    proliferative = {'positive_markers': ['MKI67'], 'negative_markers': []}
-    df = access.counts([antigen_experienced_cytotoxic, proliferative])
-    fractions = df['CD8A+ CD3+ CD45RO+ and MKI67+'] / df['all cells']
-    fractions1 = fractions[df['cohort'] == '1']
-    fractions2 = fractions[df['cohort'] == '2']
-    compare(fractions2, fractions1, expected_fold=1.41)
+    for phenotype, fold in zip(['T helper cell antigen-experienced', 'Cytotoxic T cell antigen-experienced'], [12.42, 0.278]):
+        df = access.neighborhood_enrichment([phenotype, phenotype])
+        ne = f'neighborhood enrichment, {phenotype} and {phenotype}'
+        values1 = df[df['cohort'] == '1'][ne].dropna()
+        values2 = df[df['cohort'] == '2'][ne].dropna()
+        print(f'Neighborhood enrichment for {phenotype} and itself')
+        compare(values1, values2, expected_fold=fold, show_pvalue=True, show_auc=True)
 
 
 if __name__=='__main__':
@@ -40,4 +117,5 @@ def test(host):
         host = get_default_host(None)
     if host is None:
         raise RuntimeError('Could not determine API server.')
+    print(f'Using API server: {host}')
     test(host)