From b74d783f82031274e6fd8bad685b8caa4bed1eca Mon Sep 17 00:00:00 2001 From: Elena Buscaroli Date: Thu, 23 Mar 2023 15:02:56 +0100 Subject: [PATCH] updating differentiation counts functions --- .DS_Store | Bin 8196 -> 0 bytes .gitignore | 1 + R/plots_differentiation_tree.R | 14 +++-- R/utils_differentiation.R | 112 +++++++++++++++++++++++++-------- R/utils_mullerplot.R | 2 +- 5 files changed, 97 insertions(+), 32 deletions(-) delete mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index add3505cefdd82e04d16099956c80b2f510416c9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 8196 zcmeHMU2GIp6u#fIz>FQ}6e-;fTUe=r;)ZU4LPfxB%WnaT-O@kMy1O&bk?Bm`ncYQ5 zV+lqh`e6Kf#b`{_2jXvHqGB{^OpGy_hD7{{(U|z;6N#dU@!Ywygcf+x7}UAR+%xCi zd+xpGoA1osGs_r5XVGY7EXEj<>EcqWq3SY)_w(zLB84s0BtiDf&C%VPxpCX_{7Xhh z4G{tn0ucfc0ucfc0#^e9bY}A+U*X&ry-^(@5FzltM1a2^5_EBy4&|hf!K;I+AO#@F zQ-Gk*y~b}yCPJAG<)o0*Lj}T=q%cL8VnCQvdmqFhfDGI?bO5 zhBKsvjOqx12!Trx5V?DAuI!EH?2GgFRO$?*s_I#@=g4#98u`x5s9VnXS-3nL|f41I)2u2%tByb872+cKVUhATkdp=j^PEi8POptvNEE@jvigL zt}U5Z)!IIhOdM@r)0#}Qtxio$C~|#s%dNd5hwL%OJs~!a@CCr;&T!@t(!qLduhbsh|#IzxlLdDyQ^G_7lJ*mDbhS1{R#rWHN2*QB|u zlDU~?q^aif)q2MHG+w`1@JKeNQ^GOj)VzlHbvLYNS#!(A&aK_IPt|I5^HsHe;Xujq ztekBQWxc%V7?yKzZ`m^Z(P6^W@vQ@vSx}TZB>SA~m^n4SPOFVI>iS?YXWd)0{QI&(g)ZzK{2G6-=s)&|ukeUG4J-1T7m5yg8|=eTV4b zYMvrYQ?07L*UsilTQd}~PTN{lA1Dd$w4Gt&`g&C#%)4V_v}G@B+jN_*-^G{RmG%6d zanqqKeH^acMSa8GXtrSXl!;tn>o#f)9v`)Hqb-KzyOqwJ>YNReagQfiUfr4FfE>XG({y-Sra8lLWxlqFq+QTRfTPF+5X;@h_G=#KBY z>ci;EdHlIwyP$qyLxM+B`^Jt7#1l-dkn>{72Rxq^9Rbe+9?anA6ouEKc$2!M5||)th7r4Q;kilkt@@1*0Yfs@fV~tx(8mRYFb1 z*D10Fp{!RoC=`5J_0%H0F}6u^oxiId&eiQ3DN2 za3fM!i!`=k8*zCjcB3By7$QbRz=L=Q4->DSAZBx3KZ9rS z9G=H1yo^`yD&E4|IE{Dl9x?qBOyL`Rj~{06clUJu;+)`bDN}H4*Etwy(}zS_h(MK) zF`oZ#x$^h_2(Ji%2!a1F0;t}c+1yDER(=!A^p)q@3A*m4ix)m_QpmuBsvsRF8KmPR oFZ^Lh_X!%xZPKBf6p~t~{No=2qVu1_{+E8)M(00=@ZVkj1dxtLbpQYW diff --git a/.gitignore b/.gitignore index dde4f5b..977ba25 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ plots inst/doc **._* **.RData +**.DS_Store diff --git a/R/plots_differentiation_tree.R b/R/plots_differentiation_tree.R index 384f4dd..6bbf5c8 100644 --- a/R/plots_differentiation_tree.R +++ b/R/plots_differentiation_tree.R @@ -26,9 +26,10 @@ plot_differentiation_tree = function(x, if (single_tree || length(highlight)==1) return( - util_plot_diff(mrca.list=get_mrca_df(x, highlight=highlight, edges=edges, tps=timepoints), + util_plot_diff(mrca.list=get_mrca_df(x, highlight=highlight, edges=edges, + tps=timepoints, time_spec=!single_tree), edges=edges, - timepoints=timepoints, + # timepoints=timepoints, cls=cls) ) @@ -48,7 +49,7 @@ plot_differentiation_tree = function(x, util_plot_diff = function(cls, mrca.list, edges, - timepoints, + # timepoints, cex=1, node_palette=colorRampPalette(RColorBrewer::brewer.pal(n=9, "Set1"))) { if (cls != "") { @@ -108,9 +109,14 @@ util_plot_diff = function(cls, theme_void(base_size=9*cex) + theme(legend.position="right", text=element_text(size=9)) + guides(color="none") + - labs(title=paste0(cls, "Timepoints ", paste0(timepoints, collapse=","))) + + # labs(title=paste0(cls, "Timepoints ", paste0(timepoints, collapse=","))) + coord_fixed(0.4) + theme(plot.margin=margin()) + if ("Generation" %in% colnames(df)) + pp = pp + ggraph::facet_edges(Generation ~ vec_gt, as.table=T, ncol=4) + else + pp = pp + ggraph::facet_edges( ~ vec_gt, as.table=T, ncol=4) + return(pp) } diff --git a/R/utils_differentiation.R b/R/utils_differentiation.R index c026510..2544e3b 100644 --- a/R/utils_differentiation.R +++ b/R/utils_differentiation.R @@ -1,4 +1,5 @@ -get_mrca_df = function(x, highlight, edges, tps=c(), clonal=F) { +get_mrca_df = function(x, highlight, edges, tps=c(), time_spec=F, clonal=F, thr=1) { + if (length(tps)>0) time_spec = T if (purrr::is_empty(tps)) tps = x %>% get_timepoints() @@ -8,23 +9,32 @@ get_mrca_df = function(x, highlight, edges, tps=c(), clonal=F) { dplyr::filter(labels %in% highlight) %>% dplyr::filter(timepoints %in% tps) %>% dplyr::rename(cluster=labels) %>% + dplyr::mutate(cluster=as.character(cluster)) %>% + dplyr::group_by(cluster, lineage) %>% - dplyr::summarise(is.present=any(mean_cov > 0), .groups="keep") %>% + dplyr::mutate(is.present=any(mean_cov > thr)) %>% dplyr::ungroup() %>% - dplyr::rename(Identity=lineage) %>% + + dplyr::rename(Identity=lineage, Generation=timepoints, Population=mean_cov) %>% + + convert_tp(get_tp_to_int(x)) %>% + dplyr::inner_join(edges, by="Identity") else if (have_muts_fit(x)) - fracs = x %>% - get_vaf_dataframe() %>% - dplyr::filter(labels %in% highlight) %>% - dplyr::filter(timepoints %in% tps) %>% - dplyr::select(labels_mut, theta_binom, lineage, timepoints) %>% - dplyr::rename(cluster=labels_mut) %>% - # is.present stores whether the cluster has been observed in the cluster - dplyr::group_by(cluster, lineage) %>% - dplyr::summarise(is.present=any(theta_binom > 0), .groups="keep") %>% + fracs = x %>% + get_pop_df() %>% + dplyr::filter(Parent %in% highlight) %>% + dplyr::filter(Generation %in% get_tp_to_int(x)[tps]) %>% + # convert_tp(setNames(names(get_tp_to_int(x)), get_tp_to_int(x))) %>% + dplyr::select(Identity, Population, Lineage, Generation) %>% + dplyr::rename(cluster=Identity) %>% + + # is.present stores whether the cluster has been observed in the cluster in at least one tp + dplyr::group_by(cluster, Lineage) %>% + dplyr::mutate(is.present=any(Population > thr)) %>% dplyr::ungroup() %>% - dplyr::rename(Identity=lineage) %>% + + dplyr::rename(Identity=Lineage) %>% dplyr::inner_join(edges, by="Identity") else fracs = data.frame() @@ -32,25 +42,72 @@ get_mrca_df = function(x, highlight, edges, tps=c(), clonal=F) { return(NULL) orig = fracs %>% + + # check for each cluster if is present in all descendants of the parent + # if so, dplyr::rowwise() %>% - dplyr::mutate(is.present.parent=is_present_desc(cluster, Parent, edges, fracs) ) %>% # if is present in all descendants of the parent + dplyr::mutate(is.present.parent=is_present_desc(cluster, Generation, Parent, edges, fracs)) %>% dplyr::ungroup() %>% + # !! cannot filter now otherwise retrieving the mrca won't work + # dplyr::filter(is.present, Population > thr) %>% + # orig stores the node of differentiation tree where the cluster has originated # it will stay NA if the it's never observed in the lineage - dplyr::mutate(orig=ifelse( (is.present & !is.present.parent), Identity, NA)) %>% - dplyr::mutate(orig=ifelse( (is.na(orig) & (is.present & is.present.parent)), Parent, orig )) + dplyr::group_by(Generation) %>% + dplyr::mutate(orig.node=ifelse( (is.present & !is.present.parent), Identity, NA)) %>% + dplyr::mutate(orig.node=ifelse( (is.na(orig.node) & (is.present & is.present.parent)), Parent, orig.node )) %>% + dplyr::ungroup() %>% - mrca.list = lapply(unique(orig$cluster), get_mrca_list, edges=edges, orig=orig) %>% - setNames(nm=unique(orig$cluster)) + dplyr::select(-dplyr::contains("is.present")) %>% + + dplyr::group_by(Generation, cluster) %>% + dplyr::mutate(orig.node=ifelse(Generation==Generation & cluster==cluster, + get_mrca_list(unique(cluster), edges, + dplyr::filter(., cluster==unique(cluster),Generation==unique(Generation))), + orig.node)) %>% + dplyr::ungroup() %>% - if (all(mrca.list %>% unlist() %>% unique() %>% is.na())) + dplyr::select(-Identity, -Parent) %>% + dplyr::rename(Identity=orig.node) + + # mrca.list = lapply(unique(orig$Generation), function(gen) + # lapply(unique(orig$cluster), + # function(cls) { + # mrca = get_mrca_list(cls, edges, orig %>% dplyr::filter(Generation==gen)) + # orig %>% dplyr::mutate(orig=ifelse(Generation==gen & cluster==cls, mrca, orig)) + # } ) ) %>% + # # setNames(nm=unique(orig$cluster)) %>% unlist() ) %>% + # data.frame() %>% tibble::rownames_to_column() %>% + # setNames(c("cluster",unique(orig$Generation))) %>% + # reshape2::melt(id="cluster", value.name="Identity", variable.name="Generation") %>% + # dplyr::filter(!is.na(Identity)) %>% + # dplyr::inner_join(orig %>% dplyr::select(cluster,Generation,Population) %>% unique()) + + # if (all(mrca.list %>% unlist() %>% unique() %>% is.na())) + if (nrow(orig)==0) return(NULL) + if (time_spec) + return( + orig %>% + dplyr::filter(Population>=thr) %>% + dplyr::select(-Population) %>% unique() %>% + + dplyr::inner_join(edges, by="Identity") %>% + dplyr::rename(mrca.from=Parent, mrca.to=Identity) %>% + + dplyr::group_by(mrca.to, Generation) %>% + dplyr::mutate(n_clones=length(cluster), cluster=paste(cluster, collapse=", ")) %>% + dplyr::ungroup() %>% + + unique() + ) + return( - mrca.list %>% - data.frame() %>% t() %>% as.data.frame() %>% - tibble::rownames_to_column() %>% setNames(c("cluster","Identity")) %>% + orig %>% + dplyr::filter(Population>=thr) %>% + dplyr::select(-Generation, -Population) %>% unique() %>% dplyr::inner_join(edges, by="Identity") %>% dplyr::rename(mrca.from=Parent, mrca.to=Identity) %>% @@ -69,7 +126,7 @@ get_mrca_list = function(cls, edges, orig) { # orig is a dataframe reporting where each cluster has been observed # the function's purpose is to return the MRCA of "cls" - nodes = orig %>% filter(cluster==cls) %>% filter(!is.na(orig)) %>% dplyr::pull(orig) + nodes = orig %>% filter(cluster==cls) %>% filter(!is.na(orig.node)) %>% dplyr::pull(orig.node) %>% unique() root = get_root(edges) if (length(unique(nodes)) == 1) return(nodes %>% unique()) @@ -104,14 +161,15 @@ get_mrca = function(edges, n1, n2) { } -is_present_desc = function(node, parent, edges, fracs) { +is_present_desc = function(cls, tp, parent, edges, fracs) { + desc = get_desc_list(edges)[[parent]] - # check for each descendant whether cluster "node" is observed in lineage "dd" + # check for each descendant whether cluster "cluster" is observed in lineage "dd" -> Identity contains the "mrca.to" for (dd in desc) { if ( dd %in% fracs$Identity && - !(fracs %>% dplyr::filter(cluster==node, Identity==dd) %>% dplyr::pull(is.present)) ) - return(FALSE) + !(fracs %>% dplyr::filter(cluster==cls, Generation==tp, Identity==dd) %>% dplyr::pull(is.present)) ) + return(FALSE) } return(TRUE) } diff --git a/R/utils_mullerplot.R b/R/utils_mullerplot.R index b7eedc0..605f52a 100644 --- a/R/utils_mullerplot.R +++ b/R/utils_mullerplot.R @@ -69,7 +69,7 @@ add_time_0 = function(pop_df, convert_tp = function(pop_df, timepoints_to_int) { # if (is.null(timepoints_to_int)) return(pop_df) - if(is.numeric(pop_df$Generation)) pop_df = pop_df %>% dplyr::mutate(Generation=as.character(Generation)) + pop_df = pop_df %>% dplyr::mutate(Generation=as.character(Generation)) return( pop_df %>% dplyr::mutate(Generation=timepoints_to_int[Generation])