
Commit

scgpt's preprocessing is omitted
janursa committed Dec 1, 2024
1 parent 734712f commit af362dc
Showing 6 changed files with 128 additions and 46 deletions.
42 changes: 18 additions & 24 deletions runs.ipynb
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -79,29 +79,6 @@
"datasets = ['op', 'replogle2', 'nakatake', 'norman', 'adamson']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 25551 × 22787\n",
" obs: 'cell_type', 'donor_id'\n",
" var: 'gene_ids', 'interval'\n",
" layers: 'X_norm', 'counts'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ad.read('resources/inference_datasets/op_rna.h5ad')"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -4194,6 +4171,23 @@
"from src.helper import analyse_meta_cells\n",
"analyse_meta_cells(task_grn_inference_dir='./')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Effect of imputation on performance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.helper import analyse_imputation\n",
"analyse_imputation(task_grn_inference_dir='./')\n"
]
}
],
"metadata": {
85 changes: 83 additions & 2 deletions src/helper.py
@@ -117,7 +117,86 @@ def analyse_meta_cells(task_grn_inference_dir):
scores_all = pd.concat([scores_all, scores], axis=0)
print(scores_all)
scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv")


def analyse_imputation(task_grn_inference_dir):
    dataset = 'op'  # alternatives: 'nakatake', 'norman'


par = {
'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad',
"evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad",

'layer': 'X_norm',
'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json',
'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv',
'apply_tf': True,
'apply_skeleton': False,
'verbose': 2,
'max_n_links': 50_000,
'temp_dir': 'output/imputation/',
'subsample': -1,
'num_workers': 4,
'binarize': True,
'reg_type': 'ridge'
}

os.makedirs(par['temp_dir'], exist_ok=True)
# - imports
sys.path.append(f'{task_grn_inference_dir}/')
sys.path.append(f'{task_grn_inference_dir}/src/utils')

from src.metrics.regression_1.main import main as main_reg1
from src.metrics.regression_2.main import main as main_reg2

from util import corr_net

    # - read inputs and compute PCA / nearest neighbors
rna = ad.read_h5ad(par['rna'])
rna.X = rna.layers[par['layer']]

sc.pp.pca(rna)
sc.pp.neighbors(rna)

X_original = rna.X.todense().A
    # - impute with each method and evaluate the resulting networks
gene_names = rna.var_names
    for i_run, method in enumerate(['knn', 'softimpute', -1]):  # -1 = no imputation; add 'magic' here to also test MAGIC
if method == -1:
X = X_original
elif method == 'knn':
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=10)
X = knn_imputer.fit_transform(X_original)
elif method == 'softimpute':
from fancyimpute import SoftImpute
X = SoftImpute().fit_transform(X_original)
elif method == 'magic':
import scprep
import magic

magic_operator = magic.MAGIC()
X = magic_operator.fit_transform(X_original)
else:
            raise ValueError(f'Unknown imputation method: {method}')
# - run corr
net = corr_net(X, gene_names, par)
par['prediction'] = f"{par['temp_dir']}/net_{method}.csv"
net.to_csv(par['prediction'])

# - regression 1 and 2
scores_reg1 = main_reg1(par)

scores_reg2 = main_reg2(par)

scores = pd.concat([scores_reg1, scores_reg2], axis=1)
scores.index = [method]

if i_run == 0:
scores_all = scores
else:
scores_all = pd.concat([scores_all, scores], axis=0)
print(scores_all)
scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv")
def check_scores(par):
df_scores = pd.read_csv(f"resources/scores/scgen_pearson-ridge.csv", index_col=0)
df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))
@@ -372,4 +451,6 @@ def reformat_data(df_local):
if __name__ == '__main__':
# run_grn_inference()
# calculate_scores()
analyse_meta_cells(task_grn_inference_dir='./')
# analyse_meta_cells(task_grn_inference_dir='./')
analyse_imputation(task_grn_inference_dir='./')
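
Note: analyse_imputation delegates network construction to corr_net from src/utils/util.py, which is not part of this diff. Below is a minimal sketch of what a correlation-based builder with that call signature might look like; it is an assumption rather than the repository's implementation, and the TF-list handling and max_n_links cutoff are guesses based on the par keys above.

import numpy as np
import pandas as pd

def corr_net_sketch(X, gene_names, par):
    # Illustrative correlation-based GRN: TF -> target edges ranked by |Pearson r|.
    expr = pd.DataFrame(np.asarray(X), columns=list(gene_names))
    tf_all = set(pd.read_csv(par['tf_all'], header=None)[0].astype(str))
    sources = [g for g in gene_names if g in tf_all] if par.get('apply_tf', True) else list(gene_names)
    corr = pd.DataFrame(np.corrcoef(expr.values, rowvar=False), index=expr.columns, columns=expr.columns)
    net = (corr.loc[sources]
               .stack()                      # long format: (source, target) -> weight
               .rename('weight')
               .reset_index()
               .rename(columns={'level_0': 'source', 'level_1': 'target'}))
    net = net[net['source'] != net['target']]
    net = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
    return net.head(par.get('max_n_links', 50_000)).reset_index(drop=True)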

33 changes: 17 additions & 16 deletions src/methods/single_omics/scgpt/script.py
@@ -172,26 +172,26 @@ def monitor_memory():
adata.X = adata.X.todense()
adata.obs["celltype"] = adata.obs["cell_type"].astype("category")
adata.obs["str_batch"] = adata.obs["donor_id"].astype(str)
data_is_raw = False
data_is_raw = True

adata.var["id_in_vocab"] = [1 if gene in vocab else -1 for gene in adata.var.index]
gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
adata = adata[:, adata.var["id_in_vocab"] >= 0]

preprocessor = Preprocessor(
use_key="X", # the key in adata.layers to use as raw data
filter_gene_by_counts=3, # step 1
filter_cell_by_counts=False, # step 2
normalize_total=1e4, # 3. whether to normalize the raw data and to what sum
result_normed_key="X_normed", # the key in adata.layers to store the normalized data
log1p=data_is_raw, # 4. whether to log1p the normalized data
result_log1p_key="X_log1p",
subset_hvg= False, # 5. whether to subset the raw data to highly variable genes
hvg_flavor="seurat_v3" if data_is_raw else "cell_ranger",
binning=n_input_bins, # 6. whether to bin the raw data and to what number of bins
result_binned_key="X_binned", # the key in adata.layers to store the binned data
)
preprocessor(adata, batch_key="str_batch")
# preprocessor = Preprocessor(
# use_key="counts", # the key in adata.layers to use as raw data
# filter_gene_by_counts=3, # step 1
# filter_cell_by_counts=False, # step 2
# normalize_total=1e4, # 3. whether to normalize the raw data and to what sum
# result_normed_key="X_normed", # the key in adata.layers to store the normalized data
# log1p=data_is_raw, # 4. whether to log1p the normalized data
# result_log1p_key="X_log1p",
# subset_hvg= False, # 5. whether to subset the raw data to highly variable genes
# hvg_flavor="seurat_v3" if data_is_raw else "cell_ranger",
# binning=n_input_bins, # 6. whether to bin the raw data and to what number of bins
# result_binned_key="X_binned", # the key in adata.layers to store the binned data
# )
# preprocessor(adata, batch_key="str_batch")

# print('Subsetting to HVGs')
# sc.pp.highly_variable_genes(
@@ -205,7 +205,8 @@ def monitor_memory():
# adata = adata[:, adata.var["highly_variable"]].copy()


input_layer_key = "X_binned"

input_layer_key = "X_norm"
all_counts = (
adata.layers[input_layer_key].A
if issparse(adata.layers[input_layer_key])
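
With the Preprocessor call commented out, the script now reads the X_norm layer of the input AnnData directly, so that layer has to exist upstream. A minimal sketch of how such a layer could be produced with scanpy is shown below; this is an assumption about the pipeline (and note that the binning step the preprocessor used to perform is no longer applied here).

import scanpy as sc

def add_x_norm_layer(adata, target_sum=1e4):
    # Keep raw counts, then store a normalized + log1p matrix under layers['X_norm'] (illustrative).
    adata.layers['counts'] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=target_sum)
    sc.pp.log1p(adata)
    adata.layers['X_norm'] = adata.X.copy()
    adata.X = adata.layers['counts'].copy()  # restore raw counts in .X
    return adata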
5 changes: 4 additions & 1 deletion src/metrics/regression_1/main.py
@@ -283,7 +283,10 @@ def main(par):

if par['apply_skeleton']: #apply skeleton
print('Before filtering with skeleton:', net.shape)
skeleton = np.loadtxt(par['skeleton'], dtype=str)
skeleton = pd.read_csv(par['skeleton'])
skeleton['link'] = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str)
skeleton = skeleton['link'].values.flatten()

net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
net = net[net['link'].isin(skeleton)]
print('After filtering with skeleton:', net.shape)
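
The skeleton file is now read with pd.read_csv instead of np.loadtxt, so both regression metrics expect a CSV with 'source' and 'target' columns. A hypothetical example of such a file, written here only to illustrate the expected format (the path and gene names are made up):

import pandas as pd

# Hypothetical skeleton: one row per allowed TF -> target link.
skeleton = pd.DataFrame({
    'source': ['STAT1', 'STAT1', 'GATA3'],
    'target': ['IRF1', 'CXCL10', 'IL4'],
})
skeleton.to_csv('output/example_skeleton.csv', index=False)
# The metrics keep only net links whose 'source_target' pair appears in this file.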
5 changes: 4 additions & 1 deletion src/metrics/regression_2/main.py
@@ -27,7 +27,10 @@ def net_to_matrix(net, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray
raise ValueError('Fix this')
if par['apply_skeleton']: #apply skeleton
print('Before filtering with skeleton:', net.shape)
skeleton = np.loadtxt(par['skeleton'], dtype=str)
skeleton = pd.read_csv(par['skeleton'])
skeleton['link'] = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str)
skeleton = skeleton['link'].values.flatten()

net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
net = net[net['link'].isin(skeleton)]
print('After filtering with skeleton:', net.shape)
4 changes: 2 additions & 2 deletions src/robustness_analysis/script_all.py
@@ -11,8 +11,8 @@
'write_dir': "resources/results/robustness_analysis",
'degrees': [0, 10, 20, 50, 100],
# 'degrees': [50],
'noise_types': ["net", "sign"],
# 'noise_types': ['weight'],
# 'noise_types': ["net", "sign", 'weight'],
'noise_types': ['weight'],
'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],

"evaluation_data": "resources/evaluation_datasets/op_perturbation.h5ad",
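
The robustness analysis is switched to the 'weight' noise type at the degrees listed above. Purely as an illustration of that kind of perturbation (the repository's actual implementation lives in the robustness scripts and may differ), shuffling the weights of a given percentage of edges could look like this:

import numpy as np
import pandas as pd

def add_weight_noise(net: pd.DataFrame, degree: int, seed: int = 0) -> pd.DataFrame:
    # Shuffle the weights of `degree` percent of edges (illustrative 'weight' noise).
    rng = np.random.default_rng(seed)
    net = net.copy()
    n = int(len(net) * degree / 100)
    idx = rng.choice(len(net), size=n, replace=False)
    col = net.columns.get_loc('weight')
    net.iloc[idx, col] = rng.permutation(net['weight'].to_numpy()[idx])
    return net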
