util module
janursa committed Sep 19, 2024
1 parent 2294214 commit 79a6f65
Showing 5 changed files with 107 additions and 101 deletions.
86 changes: 31 additions & 55 deletions runs.ipynb

Large diffs are not rendered by default.

9 changes: 8 additions & 1 deletion src/api/comp_method.yaml
@@ -51,11 +51,18 @@ functionality:
type: boolean
direction: input
default: false
description: Normalize RNA-seq data before inference. Currently only applicable to baseline models.
- name: --only_hvgs
type: boolean
direction: input
default: false
description: Subset RNA-seq data to only 7000 HVGs to reduce dimensionality.
resources:
- path: ../utils/util.py

test_resources:
- type: python_script
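Note: these arguments reach the component scripts through the Viash-injected par dict (see the VIASH START blocks below). A minimal sketch of how the two new flags drive preprocessing via the util module added in this commit (defaults as declared above):

import anndata as ad
from utils.util import process_data

par = {'normalize': False, 'only_hvgs': True}
adata = ad.read_h5ad('resources/grn-benchmark/multiomics_rna_0.h5ad')
adata = process_data(adata, par)  # normalize/log1p/scale if requested, then subset to HVGs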
53 changes: 9 additions & 44 deletions src/control_methods/pearson/script.py
@@ -6,6 +6,7 @@
from tqdm import tqdm
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler
from utils.util import process_data, create_corr_net

## VIASH START
par = {
@@ -15,62 +16,26 @@
'cell_type_specific': True,
'max_n_links': 50000,
'prediction': 'resources/grn_models/donor_0_default/pearson.csv',
"seed": 32
"seed": 32,
'only_hvgs': True,
'normalize': False
}
## VIASH END
print(par)

print('Read data')
multiomics_rna = ad.read_h5ad(par["multiomics_rna"])

multiomics_rna = process_data(multiomics_rna, par)  # normalize and/or subset to HVGs per par
gene_names = multiomics_rna.var_names.to_numpy()
tf_all = np.loadtxt(par['tf_all'], dtype=str)
groups = multiomics_rna.obs.cell_type
tf_all = np.intersect1d(tf_all, gene_names)
n_tfs = len(tf_all)


print('Create corr net')
# tf_all=None: corr_net samples n_tfs random source columns as the control;
# n_tfs is passed explicitly so the sample size is well defined
net = create_corr_net(multiomics_rna.X, multiomics_rna.var_names, groups, par, tf_all=None, n_tfs=n_tfs)

print('Output GRN')
net.to_csv(par['prediction'])
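Side note for reviewers: the correlation construction above relies on the identity that, after StandardScaler (zero mean, unit variance per column), X.T @ X / n is exactly the Pearson correlation matrix. A minimal self-contained check with toy data (shapes and seed are illustrative):

import numpy as np
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(32)
X = rng.normal(size=(100, 5))           # 100 cells x 5 genes, toy data

Xs = StandardScaler().fit_transform(X)  # zero mean, unit variance per gene
net = Xs.T @ Xs / Xs.shape[0]           # gene x gene Pearson estimate

assert np.allclose(net, np.corrcoef(X, rowvar=False))  # matches Pearson r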
5 changes: 4 additions & 1 deletion src/methods/single_omics/genie3/script.py
@@ -3,6 +3,7 @@
import anndata
import numpy as np
import pandas as pd
import scanpy as sc
from arboreto.algo import genie3
from distributed import Client

@@ -12,7 +13,8 @@
'multiomics_rna': 'resources/grn-benchmark/multiomics_rna_0.h5ad',
"tf_all": 'resources/prior/tf_all.csv',
'prediction': 'output/genie3_donor0.csv',
'max_n_links': 50000,
'only_hvgs': True
}
## VIASH END

@@ -27,6 +29,7 @@
tfs = set(df['gene_name'])
tf_names = [gene_name for gene_name in gene_names if (gene_name in tfs)]


print('GRN inference')
client = Client(processes=False)
network = genie3(X, client_or_address=client, gene_names=gene_names, tf_names=tf_names)
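The rest of this script is collapsed in the diff. For context, arboreto's genie3 returns a DataFrame with columns TF, target, and importance; a hedged sketch of how that output could feed the new process_links helper (the renames and calls below are illustrative, not code from this commit):

from utils.util import process_links

# map arboreto's TF / target / importance columns onto the
# source / target / weight schema that process_links expects
network = network.rename(columns={'TF': 'source', 'importance': 'weight'})
network = process_links(network, par)  # drop self-loops, keep top max_n_links by |weight|
network.to_csv(par['prediction'])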
55 changes: 55 additions & 0 deletions src/utils/util.py
@@ -0,0 +1,55 @@
import pandas as pd
import anndata as ad
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm
import scanpy as sc
from sklearn.preprocessing import StandardScaler  # used by corr_net below

def process_links(net, par):
    # drop self-loops, then keep the max_n_links strongest edges by |weight|
    net = net[net.source != net.target]
    net_sorted = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
    net = net_sorted.head(par['max_n_links']).reset_index(drop=True)
    return net

def corr_net(X, gene_names, par, tf_all=None, n_tfs=None):
    X = StandardScaler().fit_transform(X)
    # correlation of standardized columns: X.T @ X / n is the Pearson matrix
    net = np.dot(X.T, X) / X.shape[0]
    net = pd.DataFrame(net, index=gene_names, columns=gene_names)
    if tf_all is None:
        # no TF list given: sample n_tfs random source columns (control setting);
        # the caller must supply n_tfs in this case
        net = net.sample(n_tfs, axis=1, random_state=par['seed'])
    else:
        net = net[tf_all]
    net = net.reset_index()
    index_name = net.columns[0]
    net = net.melt(id_vars=index_name, var_name='source', value_name='weight')
    net.rename(columns={index_name: 'target'}, inplace=True)
    net = process_links(net, par)
    return net

def create_corr_net(X, gene_names, groups, par, tf_all=None, n_tfs=None):
    if par['cell_type_specific']:
        grns = []
        for group in tqdm(np.unique(groups), desc="Processing groups"):
            X_sub = X[groups == group, :]
            # pass tf_all / n_tfs through so every per-group net uses the same sources
            net = corr_net(X_sub, gene_names, par, tf_all=tf_all, n_tfs=n_tfs)
            net['cell_type'] = group
            grns.append(net)
        grn = pd.concat(grns, axis=0).reset_index(drop=True)
    else:
        grn = corr_net(X, gene_names, par, tf_all=tf_all, n_tfs=n_tfs)
    return grn

def process_data(adata, par):
    if par['normalize']:
        print('Normalize data')
        sc.pp.normalize_total(adata)
        sc.pp.log1p(adata)
        sc.pp.scale(adata)
    if sp.issparse(adata.X):
        adata.X = adata.X.todense()
    if par['only_hvgs']:
        print('Subsetting data to HVGs')
        # adata.var.hvg_counts is assumed to be a boolean mask marking the HVGs
        adata = adata[:, adata.var.hvg_counts.astype(bool)]
        print('New dimension of data: ', adata.shape)
    # return the (possibly subset) AnnData; in-place mutation alone would lose the HVG subset
    return adata

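To make the new module's contract concrete, here is a small illustrative run of process_links on toy data (the frame below is hypothetical):

import pandas as pd
from utils.util import process_links

par = {'max_n_links': 2}
net = pd.DataFrame({
    'source': ['TF1', 'TF1', 'TF2', 'TF2'],
    'target': ['TF1', 'G1', 'G2', 'G3'],
    'weight': [0.9, -0.8, 0.5, 0.1],
})
net = process_links(net, par)
# the TF1 -> TF1 self-loop is dropped, then the two largest |weight|
# edges remain: TF1 -> G1 (-0.8) and TF2 -> G2 (0.5)
print(net)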