Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

match/mismatch accuracy on simulations #2

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
24 changes: 1 addition & 23 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,23 +1 @@
# Genes2Genes
### A new algorithm and framework for single-cell trajectory alignment

Genes2Genes aims to guide downstream comparative analysis of reference and query systems along any axis of progression (e.g. pseudotime) through both gene-level and cell-level alignment.

You can use this framework to perform comparisons such as:
<ul>
<li>Organoid vs. Reference tissue
<li>Healthy vs. Disease
<li>Control vs. Treatment
</ul>

#### Input to Genes2Genes
(1) Reference adata (with `adata_ref.X` storing log1p normalised expression),
(2) Query adata (with `adata_query.X` storing log1p normalised expression), and
(3) Pseudotime estimates stored in each adata object under `adata.obs['time']`.

**<span style="color:red">Note: This is the initial and testable version of Genes2Genes (in confidence -- still unpublished and under refinement) so you might run into unforseen errors and bugs. Please do let me know so that I can correct them before releasing the stable version. Thanks!</span>**

### Tutorial

Please refer to the tutorial notebook which gives steps to analyse an example reference and query dataset: 2 treatment groups of mouse-bone-marrow-derived Dendritic cells from Shalek et al (2014). The respective single-cell datasets along with pseudotime estimates were downloaded from CellAlign (Alpert et al 2018) and packaged into adata objects.

Simulation Trials
1,728 changes: 1,728 additions & 0 deletions notebooks/TrajectorySimulatorUsingGPs.ipynb

Large diffs are not rendered by default.

155 changes: 155 additions & 0 deletions notebooks/run_match_accuracy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
import anndata
import numpy as np
import pandas as pd
import seaborn as sb
import scanpy as sc
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate
import os,sys,inspect
import pickle
# setting the path to source
# sys.path.insert(0,os.path.dirname(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))) + '/source')
sys.path.append('../source')

# new source imports
import OrgAlign as orgalign
import Main
import MyFunctions
import TimeSeriesPreprocessor
# import PathwayAnalyser

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import normalize
import multiprocessing

import argparse
parser = argparse.ArgumentParser()
parser.add_argument("mm_type",
type=str)
parser.add_argument("mm_size",
type=int,
help="path to working directory")
parser.add_argument("--save_aligner",
action='store_true')
args = parser.parse_args()

mm_type = args.mm_type
mm_size = args.mm_size
save_aligner = args.save_aligner

def simulate_alignment2(adata, true_align_string,
frac_query = 0.5,
seed=42352,
gene = 'Msi1',
n_stds = 3):
np.random.seed(seed)
n_bins=len(true_align_string)
adata.obs['time_bins'] = pd.cut(adata.obs['time'], n_bins).astype('category').cat.codes
q_cells= np.array([])

## Split in ref and query
for i,b in enumerate(true_align_string):
n_cells = sum(adata.obs['time_bins'] == i)
q_cells_bin = np.random.choice(adata.obs_names[adata.obs['time_bins'] == i], size=int(np.round(n_cells*frac_query)), replace=False)
q_cells = np.hstack([q_cells, q_cells_bin])

adata_query = adata[q_cells].copy()
adata_ref = adata[~adata.obs_names.isin(q_cells)].copy()

## Calculate shift for insertion
X_query = adata_query.X.copy()
X_gene = X_query[:,adata_query.var_names == gene]
# ins_shift = n_stds*X_gene.std()

## Find max bin for insertion (using interpolated mean)
gene_list = adata_ref.var_names
aligner = Main.RefQueryAligner(adata_ref, adata_query, gene_list, 40)
aligner.WEIGHT_BY_CELL_DENSITY = True
aligner.WINDOW_SIZE = 0.1
al_obj = aligner.align_single_pair(gene)
max_bin = np.array(al_obj.S.intpl_means).argmax()
X_insert = adata_query[adata_query.obs['time_bins'] == max_bin, gene].X
mean_shift = X_insert.mean() + n_stds * X_insert.std()
std_shift = X_insert.std()

for i,b in enumerate(true_align_string):
bcells = adata_query.obs_names[adata_query.obs['time_bins'] == i]
if b == 'D': ## delete cells
adata_query = adata_query[~adata_query.obs_names.isin(bcells)].copy()
if b == 'I': # change values for gene expression
X_gene = X_query[adata_query.obs_names.isin(bcells),adata_query.var_names == gene]
X_ins = np.random.normal(loc=mean_shift, scale=std_shift, size=len(X_gene))
X_query[adata_query.obs_names.isin(bcells),adata_query.var_names == gene] = X_ins
adata_query.X = X_query.copy()

# Algorithm expect time spanning from 0 to 1
adata_ref.obs['time'] = TimeSeriesPreprocessor.Utils.minmax_normalise(adata_ref.obs['time'].values)
adata_query.obs['time'] = TimeSeriesPreprocessor.Utils.minmax_normalise(adata_query.obs['time'].values)
return(adata_ref, adata_query)

def make_align_string(mm_type, mm_start = 10, n_bins = 50, mm_size=10):
mm_ixs = range(mm_start, mm_start+mm_size)
true_align_string = ''.join([mm_type if i in mm_ixs else 'M' for i in range(n_bins)])
return(true_align_string)

def alignment_viz(aligner, al_obj):
# plt.subplots(1,2,figsize=(10,3))
# plt.subplot(1,2,1)
# al_obj.plotTimeSeries(aligner, plot_cells=True)
# plt.subplot(1,2,2)
# al_obj.plotTimeSeriesAlignment()
print(al_obj.al_visual)

def predict_alignment(adata_ref, adata_query, gene, n_bins=50):
gene_list = adata_ref.var_names
aligner = Main.RefQueryAligner(adata_ref, adata_query, gene_list, n_bins)
aligner.WEIGHT_BY_CELL_DENSITY = True
aligner.WINDOW_SIZE = 0.1
al_obj = aligner.align_single_pair(gene)
alignment_viz(aligner, al_obj)
return(al_obj)

def get_ref_aling_str(al_obj):
ref_ixs = (al_obj.al_visual.split('\n')[1]).split(' Reference')[0]
al_str = al_obj.alignment_str
ref_aling_str = ''.join([al_str[i] for i,p in enumerate(ref_ixs) if p!=' ' and al_str[i] != 'V'])
return(ref_aling_str)


def run_match_accuracy(params):
adata, gene, align_params, save_aligner = params
match_dict = {'D':'mismatch', 'I':'mismatch', 'M':'match', 'V':'match', 'W':'match'}
true_align_string = make_align_string(**align_params)
rdata, qdata = simulate_alignment2(adata, true_align_string, gene=gene)
al_obj = predict_alignment(rdata, qdata, gene=gene)
if save_aligner:
with open(f'./data/aligner_{gene}.{align_params["mm_type"]}.size{str(align_params["mm_size"])}.pkl', 'wb') as f:
pickle.dump(al_obj, f)
true_ref_align_str = get_ref_aling_str(al_obj)

# get mismatch accuracy
outcome_df = pd.DataFrame([(i, match_dict[true_align_string[i]], match_dict[c]) for i,c in enumerate(get_ref_aling_str(al_obj) )],
columns=['position', 'true', 'predicted']
)
outcome_df['correct'] = outcome_df['true'] == outcome_df['predicted']
accuracy = outcome_df['correct'].sum()/outcome_df['correct'].shape[0]
outcome_df['accuracy'] = accuracy
outcome_df['gene'] = gene
for p in align_params.keys():
outcome_df[p] = align_params[p]
outcome_df = outcome_df[list(align_params.keys()) + ['gene', 'accuracy']].drop_duplicates()
return(outcome_df)

adata = sc.read_h5ad("./data/match_accuracy_pancreas.h5ad")
match_outcome = pd.DataFrame()
pool = multiprocessing.Pool(7)
outcomes_g = pool.map(run_match_accuracy,
[(adata, g, {'mm_type':mm_type, 'n_bins':50, 'mm_start':0, 'mm_size':mm_size}, args.save_aligner) for g in adata.var_names[adata.var['simulation_gene']]])
pool.close()
outcomes_g = pd.concat(outcomes_g)
match_outcome = pd.concat([match_outcome, outcomes_g])
match_outcome.to_csv(f'./data/match_accuracy_pancreas.{mm_type}.size{str(mm_size)}.csv')
1,498 changes: 1,498 additions & 0 deletions notebooks/simulations.ipynb

Large diffs are not rendered by default.

Loading