Teichlab · emdann · Nov 24, 2022 · Nov 28, 2022 · Dec 4, 2022 · Dec 4, 2022
diff --git a/README.md b/README.md
@@ -1,23 +1 @@
-# Genes2Genes
-### A new algorithm and framework for single-cell trajectory alignment 
-
-Genes2Genes aims to guide downstream comparative analysis of reference and query systems along any axis of progression (e.g. pseudotime) through both gene-level and cell-level alignment. 
-
-You can use this framework to perform comparisons such as:
-<ul>
-    <li>Organoid vs. Reference tissue
-    <li>Healthy vs. Disease
-    <li>Control vs. Treatment
-</ul>   
-
-#### Input to Genes2Genes
-(1) Reference adata (with `adata_ref.X` storing log1p normalised expression), 
-(2) Query adata (with `adata_query.X` storing log1p normalised expression), and
-(3) Pseudotime estimates stored in each adata object under  `adata.obs['time']`.
-
-**<span style="color:red">Note: This is the initial and testable version of Genes2Genes (in confidence -- still unpublished and under refinement) so you might run into unforseen errors and bugs. Please do let me know so that I can correct them before releasing the stable version. Thanks!</span>**
-
-### Tutorial
-
-Please refer to the tutorial notebook which gives steps to analyse an example reference and query dataset: 2 treatment groups of mouse-bone-marrow-derived Dendritic cells from Shalek et al (2014). The respective single-cell datasets along with pseudotime estimates were downloaded from CellAlign (Alpert et al 2018) and packaged into adata objects. 
-
+Simulation Trials
diff --git a/notebooks/TrajectorySimulatorUsingGPs.ipynb b/notebooks/TrajectorySimulatorUsingGPs.ipynb
diff --git a/notebooks/run_match_accuracy.py b/notebooks/run_match_accuracy.py
@@ -0,0 +1,155 @@
+import anndata
+import numpy as np
+import pandas as pd
+import seaborn as sb
+import scanpy as sc
+import scipy.stats as stats
+import matplotlib.pyplot as plt
+import seaborn as sns
+from tabulate import tabulate
+import os,sys,inspect
+import pickle
+# setting the path to source
+# sys.path.insert(0,os.path.dirname(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))) + '/source') 
+sys.path.append('../source') 
+
+# new source imports 
+import OrgAlign as orgalign
+import Main
+import MyFunctions 
+import TimeSeriesPreprocessor
+# import PathwayAnalyser
+
+import warnings
+warnings.filterwarnings("ignore")
+
+from sklearn.preprocessing import normalize
+import multiprocessing
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("mm_type",
+                    type=str)
+parser.add_argument("mm_size",
+                    type=int,
+                    help="path to working directory")
+parser.add_argument("--save_aligner",
+                    action='store_true')
+args = parser.parse_args()
+
+mm_type = args.mm_type
+mm_size = args.mm_size
+save_aligner = args.save_aligner
+
+def simulate_alignment2(adata, true_align_string, 
+                       frac_query = 0.5,
+                       seed=42352,
+                       gene = 'Msi1',
+                       n_stds = 3):
+    np.random.seed(seed)
+    n_bins=len(true_align_string)
+    adata.obs['time_bins'] = pd.cut(adata.obs['time'], n_bins).astype('category').cat.codes
+    q_cells= np.array([])
+
+    ## Split in ref and query
+    for i,b in enumerate(true_align_string):
+        n_cells = sum(adata.obs['time_bins'] == i)
+        q_cells_bin = np.random.choice(adata.obs_names[adata.obs['time_bins'] == i], size=int(np.round(n_cells*frac_query)), replace=False)
+        q_cells = np.hstack([q_cells, q_cells_bin])
+
+    adata_query = adata[q_cells].copy()
+    adata_ref = adata[~adata.obs_names.isin(q_cells)].copy()
+
+    ## Calculate shift for insertion
+    X_query = adata_query.X.copy()
+    X_gene = X_query[:,adata_query.var_names == gene]
+    # ins_shift = n_stds*X_gene.std()
+
+    ## Find max bin for insertion (using interpolated mean)
+    gene_list = adata_ref.var_names 
+    aligner = Main.RefQueryAligner(adata_ref, adata_query, gene_list, 40)
+    aligner.WEIGHT_BY_CELL_DENSITY = True
+    aligner.WINDOW_SIZE = 0.1
+    al_obj = aligner.align_single_pair(gene)
+    max_bin = np.array(al_obj.S.intpl_means).argmax()
+    X_insert = adata_query[adata_query.obs['time_bins'] == max_bin, gene].X
+    mean_shift = X_insert.mean()  + n_stds * X_insert.std() 
+    std_shift = X_insert.std()
+
+    for i,b in enumerate(true_align_string):
+        bcells = adata_query.obs_names[adata_query.obs['time_bins'] == i]
+        if b == 'D': ## delete cells
+            adata_query = adata_query[~adata_query.obs_names.isin(bcells)].copy()
+        if b == 'I': # change values for gene expression  
+            X_gene = X_query[adata_query.obs_names.isin(bcells),adata_query.var_names == gene]
+            X_ins = np.random.normal(loc=mean_shift, scale=std_shift, size=len(X_gene))
+            X_query[adata_query.obs_names.isin(bcells),adata_query.var_names == gene] = X_ins
+            adata_query.X = X_query.copy()
+
+    # Algorithm expect time spanning from 0 to 1
+    adata_ref.obs['time'] = TimeSeriesPreprocessor.Utils.minmax_normalise(adata_ref.obs['time'].values)
+    adata_query.obs['time'] = TimeSeriesPreprocessor.Utils.minmax_normalise(adata_query.obs['time'].values)
+    return(adata_ref, adata_query)
+
+def make_align_string(mm_type, mm_start = 10, n_bins = 50, mm_size=10):
+    mm_ixs = range(mm_start, mm_start+mm_size)
+    true_align_string = ''.join([mm_type if i in mm_ixs else 'M' for i in range(n_bins)])
+    return(true_align_string)
+
+def alignment_viz(aligner, al_obj):
+    # plt.subplots(1,2,figsize=(10,3))
+    # plt.subplot(1,2,1)
+    # al_obj.plotTimeSeries(aligner, plot_cells=True)
+    # plt.subplot(1,2,2)
+    # al_obj.plotTimeSeriesAlignment()
+    print(al_obj.al_visual)
+
+def predict_alignment(adata_ref, adata_query, gene, n_bins=50):
+    gene_list = adata_ref.var_names 
+    aligner = Main.RefQueryAligner(adata_ref, adata_query, gene_list, n_bins)
+    aligner.WEIGHT_BY_CELL_DENSITY = True
+    aligner.WINDOW_SIZE = 0.1
+    al_obj = aligner.align_single_pair(gene)
+    alignment_viz(aligner, al_obj)
+    return(al_obj)
+
+def get_ref_aling_str(al_obj):
+    ref_ixs = (al_obj.al_visual.split('\n')[1]).split(' Reference')[0]
+    al_str = al_obj.alignment_str
+    ref_aling_str = ''.join([al_str[i] for i,p in enumerate(ref_ixs) if p!=' ' and al_str[i] != 'V'])
+    return(ref_aling_str)
+
+
+def run_match_accuracy(params):
+    adata, gene, align_params, save_aligner = params
+    match_dict = {'D':'mismatch', 'I':'mismatch', 'M':'match', 'V':'match', 'W':'match'}
+    true_align_string = make_align_string(**align_params)
+    rdata, qdata = simulate_alignment2(adata, true_align_string, gene=gene)
+    al_obj = predict_alignment(rdata, qdata, gene=gene)
+    if save_aligner:
+        with open(f'./data/aligner_{gene}.{align_params["mm_type"]}.size{str(align_params["mm_size"])}.pkl', 'wb') as f:
+            pickle.dump(al_obj, f)
+    true_ref_align_str = get_ref_aling_str(al_obj)
+
+    # get mismatch accuracy
+    outcome_df = pd.DataFrame([(i, match_dict[true_align_string[i]], match_dict[c]) for i,c in enumerate(get_ref_aling_str(al_obj) )],
+                 columns=['position', 'true', 'predicted']
+                )
+    outcome_df['correct'] = outcome_df['true'] == outcome_df['predicted']
+    accuracy = outcome_df['correct'].sum()/outcome_df['correct'].shape[0]
+    outcome_df['accuracy'] = accuracy
+    outcome_df['gene'] = gene
+    for p in align_params.keys():
+        outcome_df[p] = align_params[p]
+    outcome_df = outcome_df[list(align_params.keys()) + ['gene', 'accuracy']].drop_duplicates()
+    return(outcome_df)
+
+adata = sc.read_h5ad("./data/match_accuracy_pancreas.h5ad")
+match_outcome = pd.DataFrame()
+pool = multiprocessing.Pool(7)
+outcomes_g = pool.map(run_match_accuracy, 
+                      [(adata, g, {'mm_type':mm_type, 'n_bins':50, 'mm_start':0, 'mm_size':mm_size}, args.save_aligner) for g in adata.var_names[adata.var['simulation_gene']]])
+pool.close()
+outcomes_g = pd.concat(outcomes_g)
+match_outcome = pd.concat([match_outcome, outcomes_g])
+match_outcome.to_csv(f'./data/match_accuracy_pancreas.{mm_type}.size{str(mm_size)}.csv')
diff --git a/notebooks/simulations.ipynb b/notebooks/simulations.ipynb