full intersect of markers vs rank_genes_groups

Teichlab · Feb 29, 2024 · c34cae9 · c34cae9
1 parent 20d782f
commit c34cae9
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 2 deletions.
diff --git a/sctk/__init__.py b/sctk/__init__.py
@@ -29,6 +29,7 @@
     volcano_plot,
     calc_marker_stats,
     filter_marker_stats,
+    intersect_with_rank_genes_groups,
     top_markers,
     test_markers,
 )

diff --git a/sctk/_markers.py b/sctk/_markers.py
@@ -96,6 +96,7 @@ def draw(
             plt.show()
 
 
+# this is meant to be run on log1p-normalised data
 def calc_marker_stats(ad, groupby, genes=None, use_rep="raw", inplace=False, partial=False):
     if ad.obs[groupby].dtype.name != "category":
         raise ValueError('"%s" is not categorical' % groupby)
@@ -233,12 +234,31 @@ def filter_marker_stats(
             ["top_frac_group", "mean_diff", "frac_diff"], ascending=[True, False, False]
         )
     filtered["top_frac_group"] = filtered["top_frac_group"].astype("category")
-    filtered["top_frac_group"].cat.reorder_categories(
-        list(stats_df["top_frac_group"].cat.categories), inplace=True
+    filtered["top_frac_group"] = filtered["top_frac_group"].cat.reorder_categories(
+        list(stats_df["top_frac_group"].cat.categories)
     )
     return filtered
 
 
+def intersect_with_rank_genes_groups(mks, adata, pvals_adj_thresh=0.05):
+    """Keep only the markers in ``mks`` that rank_genes_groups also calls.
+
+    Needs ``sc.tl.rank_genes_groups()`` to have been run on ``adata``
+    beforehand. A row of ``mks`` is kept when its (gene, group) pair appears
+    in the rank_genes_groups results with ``pvals_adj`` below
+    ``pvals_adj_thresh`` and a positive ``logfoldchanges``.
+
+    Parameters
+    ----------
+    mks
+        Marker table. Its index is matched against the ``names`` column and
+        its ``top_frac_group`` column against the ``group`` column of the
+        rank_genes_groups results.
+    adata
+        Object handed to ``sc.get.rank_genes_groups_df``.
+    pvals_adj_thresh
+        Exclusive upper bound on ``pvals_adj``.
+
+    Returns
+    -------
+    The rows of ``mks`` whose (gene, group) pair passed the filter.
+    """
+    # retrieve the full rank_genes_groups results space (group=None)
+    rgg = sc.get.rank_genes_groups_df(adata, group=None)
+    # subset to overexpressed markers:
+    # pvals_adj below threshold, and positive logfoldchanges
+    overexpressed = (rgg["pvals_adj"] < pvals_adj_thresh) & (
+        rgg["logfoldchanges"] > 0
+    )
+    rgg = rgg.loc[overexpressed, :]
+    # Compare (gene, group) tuples rather than joined "GENE_GROUP" strings:
+    # a joined string is ambiguous when either name contains an underscore,
+    # and string concatenation fails for non-string labels.
+    rgg_pairs = set(zip(rgg["names"], rgg["group"]))
+    # which of the sctk markers are in the rank_genes_groups markers?
+    mask = np.array(
+        [pair in rgg_pairs for pair in zip(mks.index, mks["top_frac_group"])],
+        dtype=bool,
+    )
+    # subset and return sctk marker list
+    return mks.loc[mask, :]
+
+
 def top_markers(df, top_n=5, groupby="top_frac_group"):
     """Return, as a list, the index labels of the first ``top_n`` rows of
     each ``groupby`` group in ``df``."""
     per_group = df.groupby(groupby)
     leading_rows = per_group.head(top_n)
     return leading_rows.index.to_list()