
Commit

scgpt's preprocessing is omitted
janursa committed Dec 1, 2024
1 parent 734712f commit af362dc
Showing 6 changed files with 128 additions and 46 deletions.
42 changes: 18 additions & 24 deletions runs.ipynb
@@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@@ -79,29 +79,6 @@
"datasets = ['op', 'replogle2', 'nakatake', 'norman', 'adamson']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AnnData object with n_obs × n_vars = 25551 × 22787\n",
" obs: 'cell_type', 'donor_id'\n",
" var: 'gene_ids', 'interval'\n",
" layers: 'X_norm', 'counts'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ad.read('resources/inference_datasets/op_rna.h5ad')"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -4194,6 +4171,23 @@
"from src.helper import analyse_meta_cells\n",
"analyse_meta_cells(task_grn_inference_dir='./')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Effect of imputation on performance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from src.helper import analyse_imputation\n",
"analyse_imputation(task_grn_inference_dir='./')\n"
]
}
],
"metadata": {
85 changes: 83 additions & 2 deletions src/helper.py
@@ -117,7 +117,86 @@ def analyse_meta_cells(task_grn_inference_dir):
scores_all = pd.concat([scores_all, scores], axis=0)
print(scores_all)
scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv")


def analyse_imputation(task_grn_inference_dir):
    dataset = 'op'  # alternatives: 'nakatake', 'norman'


par = {
'rna': f'{task_grn_inference_dir}/resources/inference_datasets/{dataset}_rna.h5ad',
"evaluation_data": f"{task_grn_inference_dir}/resources/evaluation_datasets/{dataset}_perturbation.h5ad",

'layer': 'X_norm',
'consensus': f'{task_grn_inference_dir}/resources/prior/{dataset}_consensus-num-regulators.json',
'tf_all': f'{task_grn_inference_dir}/resources/prior/tf_all.csv',
'apply_tf': True,
'apply_skeleton': False,
'verbose': 2,
'max_n_links': 50_000,
'temp_dir': 'output/imputation/',
'subsample': -1,
'num_workers': 4,
'binarize': True,
'reg_type': 'ridge'
}

os.makedirs(par['temp_dir'], exist_ok=True)
# - imports
sys.path.append(f'{task_grn_inference_dir}/')
sys.path.append(f'{task_grn_inference_dir}/src/utils')

from src.metrics.regression_1.main import main as main_reg1
from src.metrics.regression_2.main import main as main_reg2

from util import corr_net

    # - read inputs and compute PCA / nearest neighbors
rna = ad.read_h5ad(par['rna'])
rna.X = rna.layers[par['layer']]

sc.pp.pca(rna)
sc.pp.neighbors(rna)

X_original = rna.X.todense().A
    # - impute with each method and evaluate the resulting networks
gene_names = rna.var_names
    for i_run, method in enumerate(['knn', 'softimpute', -1]):  # -1 = no imputation; add 'magic' here to also test MAGIC
if method == -1:
X = X_original
elif method == 'knn':
from sklearn.impute import KNNImputer
knn_imputer = KNNImputer(n_neighbors=10)
X = knn_imputer.fit_transform(X_original)
elif method == 'softimpute':
from fancyimpute import SoftImpute
X = SoftImpute().fit_transform(X_original)
elif method == 'magic':
import scprep
import magic

magic_operator = magic.MAGIC()
X = magic_operator.fit_transform(X_original)
else:
            raise ValueError(f'Unknown imputation method: {method}')
# - run corr
net = corr_net(X, gene_names, par)
par['prediction'] = f"{par['temp_dir']}/net_{method}.csv"
net.to_csv(par['prediction'])

# - regression 1 and 2
scores_reg1 = main_reg1(par)

scores_reg2 = main_reg2(par)

scores = pd.concat([scores_reg1, scores_reg2], axis=1)
scores.index = [method]

if i_run == 0:
scores_all = scores
else:
scores_all = pd.concat([scores_all, scores], axis=0)
print(scores_all)
scores_all.to_csv(f"{par['temp_dir']}/scores_all.csv")
def check_scores(par):
df_scores = pd.read_csv(f"resources/scores/scgen_pearson-ridge.csv", index_col=0)
df_all_n = (df_scores-df_scores.min(axis=0))/(df_scores.max(axis=0)-df_scores.min(axis=0))
@@ -372,4 +451,6 @@ def reformat_data(df_local):
if __name__ == '__main__':
# run_grn_inference()
# calculate_scores()
analyse_meta_cells(task_grn_inference_dir='./')
# analyse_meta_cells(task_grn_inference_dir='./')
analyse_imputation(task_grn_inference_dir='./')
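
Note: analyse_imputation delegates network construction to corr_net from src/utils/util.py, which is not part of this diff. Below is a minimal sketch of what a correlation-based builder with that call signature might look like; it is an assumption rather than the repository's implementation, and the TF-list handling and max_n_links cutoff are guesses based on the par keys above.

import numpy as np
import pandas as pd

def corr_net_sketch(X, gene_names, par):
    # Illustrative correlation-based GRN: TF -> target edges ranked by |Pearson r|.
    expr = pd.DataFrame(np.asarray(X), columns=list(gene_names))
    tf_all = set(pd.read_csv(par['tf_all'], header=None)[0].astype(str))
    sources = [g for g in gene_names if g in tf_all] if par.get('apply_tf', True) else list(gene_names)
    corr = pd.DataFrame(np.corrcoef(expr.values, rowvar=False), index=expr.columns, columns=expr.columns)
    net = (corr.loc[sources]
               .stack()                      # long format: (source, target) -> weight
               .rename('weight')
               .reset_index()
               .rename(columns={'level_0': 'source', 'level_1': 'target'}))
    net = net[net['source'] != net['target']]
    net = net.reindex(net['weight'].abs().sort_values(ascending=False).index)
    return net.head(par.get('max_n_links', 50_000)).reset_index(drop=True)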

33 changes: 17 additions & 16 deletions src/methods/single_omics/scgpt/script.py
@@ -172,26 +172,26 @@ def monitor_memory():
adata.X = adata.X.todense()
adata.obs["celltype"] = adata.obs["cell_type"].astype("category")
adata.obs["str_batch"] = adata.obs["donor_id"].astype(str)
data_is_raw = False
data_is_raw = True

adata.var["id_in_vocab"] = [1 if gene in vocab else -1 for gene in adata.var.index]
gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
adata = adata[:, adata.var["id_in_vocab"] >= 0]

preprocessor = Preprocessor(
use_key="X", # the key in adata.layers to use as raw data
filter_gene_by_counts=3, # step 1
filter_cell_by_counts=False, # step 2
normalize_total=1e4, # 3. whether to normalize the raw data and to what sum
result_normed_key="X_normed", # the key in adata.layers to store the normalized data
log1p=data_is_raw, # 4. whether to log1p the normalized data
result_log1p_key="X_log1p",
subset_hvg= False, # 5. whether to subset the raw data to highly variable genes
hvg_flavor="seurat_v3" if data_is_raw else "cell_ranger",
binning=n_input_bins, # 6. whether to bin the raw data and to what number of bins
result_binned_key="X_binned", # the key in adata.layers to store the binned data
)
preprocessor(adata, batch_key="str_batch")
# preprocessor = Preprocessor(
# use_key="counts", # the key in adata.layers to use as raw data
# filter_gene_by_counts=3, # step 1
# filter_cell_by_counts=False, # step 2
# normalize_total=1e4, # 3. whether to normalize the raw data and to what sum
# result_normed_key="X_normed", # the key in adata.layers to store the normalized data
# log1p=data_is_raw, # 4. whether to log1p the normalized data
# result_log1p_key="X_log1p",
# subset_hvg= False, # 5. whether to subset the raw data to highly variable genes
# hvg_flavor="seurat_v3" if data_is_raw else "cell_ranger",
# binning=n_input_bins, # 6. whether to bin the raw data and to what number of bins
# result_binned_key="X_binned", # the key in adata.layers to store the binned data
# )
# preprocessor(adata, batch_key="str_batch")

# print('Subsetting to HVGs')
# sc.pp.highly_variable_genes(
@@ -205,7 +205,8 @@ def monitor_memory():
# adata = adata[:, adata.var["highly_variable"]].copy()


input_layer_key = "X_binned"

input_layer_key = "X_norm"
all_counts = (
adata.layers[input_layer_key].A
if issparse(adata.layers[input_layer_key])
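
With the Preprocessor call commented out, the script now reads the X_norm layer of the input AnnData directly, so that layer has to exist upstream. A minimal sketch of how such a layer could be produced with scanpy is shown below; this is an assumption about the pipeline (and note that the binning step the preprocessor used to perform is no longer applied here).

import scanpy as sc

def add_x_norm_layer(adata, target_sum=1e4):
    # Keep raw counts, then store a normalized + log1p matrix under layers['X_norm'] (illustrative).
    adata.layers['counts'] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=target_sum)
    sc.pp.log1p(adata)
    adata.layers['X_norm'] = adata.X.copy()
    adata.X = adata.layers['counts'].copy()  # restore raw counts in .X
    return adata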
5 changes: 4 additions & 1 deletion src/metrics/regression_1/main.py
@@ -283,7 +283,10 @@ def main(par):

if par['apply_skeleton']: #apply skeleton
print('Before filtering with skeleton:', net.shape)
skeleton = np.loadtxt(par['skeleton'], dtype=str)
skeleton = pd.read_csv(par['skeleton'])
skeleton['link'] = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str)
skeleton = skeleton['link'].values.flatten()

net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
net = net[net['link'].isin(skeleton)]
print('After filtering with skeleton:', net.shape)
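
The skeleton file is now read with pd.read_csv instead of np.loadtxt, so both regression metrics expect a CSV with 'source' and 'target' columns. A hypothetical example of such a file, written here only to illustrate the expected format (the path and gene names are made up):

import pandas as pd

# Hypothetical skeleton: one row per allowed TF -> target link.
skeleton = pd.DataFrame({
    'source': ['STAT1', 'STAT1', 'GATA3'],
    'target': ['IRF1', 'CXCL10', 'IL4'],
})
skeleton.to_csv('output/example_skeleton.csv', index=False)
# The metrics keep only net links whose 'source_target' pair appears in this file.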
5 changes: 4 additions & 1 deletion src/metrics/regression_2/main.py
@@ -27,7 +27,10 @@ def net_to_matrix(net, gene_names: np.ndarray, par: Dict[str, Any]) -> np.ndarray
raise ValueError('Fix this')
if par['apply_skeleton']: #apply skeleton
print('Before filtering with skeleton:', net.shape)
skeleton = np.loadtxt(par['skeleton'], dtype=str)
skeleton = pd.read_csv(par['skeleton'])
skeleton['link'] = skeleton['source'].astype(str) + '_' + skeleton['target'].astype(str)
skeleton = skeleton['link'].values.flatten()

net['link'] = net['source'].astype(str) + '_' + net['target'].astype(str)
net = net[net['link'].isin(skeleton)]
print('After filtering with skeleton:', net.shape)
4 changes: 2 additions & 2 deletions src/robustness_analysis/script_all.py
@@ -11,8 +11,8 @@
'write_dir': "resources/results/robustness_analysis",
'degrees': [0, 10, 20, 50, 100],
# 'degrees': [50],
'noise_types': ["net", "sign"],
# 'noise_types': ['weight'],
# 'noise_types': ["net", "sign", 'weight'],
'noise_types': ['weight'],
'methods': ['negative_control', 'positive_control', 'pearson_corr', 'portia', 'ppcor', 'grnboost2', 'scenic', 'granie', 'scglue', 'celloracle', 'figr', 'scenicplus'],

"evaluation_data": "resources/evaluation_datasets/op_perturbation.h5ad",
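
The robustness analysis is switched to the 'weight' noise type at the degrees listed above. Purely as an illustration of that kind of perturbation (the repository's actual implementation lives in the robustness scripts and may differ), shuffling the weights of a given percentage of edges could look like this:

import numpy as np
import pandas as pd

def add_weight_noise(net: pd.DataFrame, degree: int, seed: int = 0) -> pd.DataFrame:
    # Shuffle the weights of `degree` percent of edges (illustrative 'weight' noise).
    rng = np.random.default_rng(seed)
    net = net.copy()
    n = int(len(net) * degree / 100)
    idx = rng.choice(len(net), size=n, replace=False)
    col = net.columns.get_loc('weight')
    net.iloc[idx, col] = rng.permutation(net['weight'].to_numpy()[idx])
    return net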
