From 1ab916bf50ac41246d576884fb6e58ca4f9f957c Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 8 Jun 2021 16:02:30 +0200 Subject: [PATCH 01/50] percentage on mgnify --- grimer/grimer.py | 2 +- grimer/plots.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/grimer/grimer.py b/grimer/grimer.py index 337fc2e..a5032e2 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -31,7 +31,7 @@ def main(): default_rank_name = "default" - version = "1.0.0-alpha0" + version = "1.0.0-alpha1" parser = argparse.ArgumentParser(description='grimer') parser.add_argument('-i', '--input-file', required=True, type=str, help="Main input table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --transpose if your file is reversed). First column and first row are used as headers.") parser.add_argument('-c', '--config', required=True, type=str, help="Configuration file") diff --git a/grimer/plots.py b/grimer/plots.py index 9ee5481..9a53835 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -418,6 +418,7 @@ def plot_decontam_widgets(): "pvalue_input": pvalue_input, "help_button": help_button(title="DECONTAM", text=help_text, align="start")} + def plot_mgnify(cds_p_mgnify): mgnify_fig = figure(height=150, width=300, tools="save,wheel_zoom,reset") @@ -439,16 +440,17 @@ def plot_mgnify(cds_p_mgnify): factors.extend(lineages) palette.extend(make_color_palette(len(lineages))) - # Custom hover to follow mouse + # Add custom tooltip to show percentage (based on angle) mgnify_fig.add_tools(HoverTool( tooltips=[("Biome", "@lineage"), - ("studies", "@count")], + ("Studies", "@count"), + ("Percentage", "@angle{custom}%")], mode="mouse", - point_policy="follow_mouse" + point_policy="follow_mouse", + formatters={"@angle": CustomJSHover(code="return ((value/6.2831853071795)*100).toFixed(2);")} )) #mgnify_fig.text(0, 1, text=["No data"], text_baseline="middle", text_align="center") - mgnify_fig.wedge(x=0, y=1, radius=0.5, start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'), From 1f33bef626bab14f2a35e43a542b790b7c0a7f87 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 3 Sep 2021 14:47:52 +0200 Subject: [PATCH 02/50] several changes and bug fixes --- grimer/callbacks.py | 8 ++++- grimer/cds.py | 13 +++++--- grimer/grimer.py | 28 +++++++++------- grimer/plots.py | 3 +- grimer/prop.py | 79 +++++++++++++++++++++++++++++++++++++++++++++ grimer/utils.py | 20 +++++++++--- 6 files changed, 127 insertions(+), 24 deletions(-) create mode 100644 grimer/prop.py diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 3689965..7cfa129 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -573,6 +573,7 @@ def link_obstable_filter(ele, cds_p_obstable, active_ranks): if (name_multichoice.value.length > 0 ){ var found = false; for(let r = 0; r < active_ranks.length; r++){ + // Compare all names on multichoice (array) against cell if (name_multichoice.value.indexOf(cds_p_obstable.data["tax|"+active_ranks[r]][i]) >= 0){ found = true; break; @@ -582,10 +583,14 @@ def link_obstable_filter(ele, cds_p_obstable, active_ranks): continue; } } - indices.push(i) + indices.push(i); } + console.log(cds_p_obstable); + console.log(widgets_filter); widgets_filter.indices = indices; cds_p_obstable.change.emit(); + console.log(cds_p_obstable); + console.log(widgets_filter); ''') ele["obstable"]["wid"]["frequency_spinner"].js_on_change('value', filter_callback) ele["obstable"]["wid"]["counts_perc_avg_spinner"].js_on_change('value', filter_callback) @@ -616,6 +621,7 @@ def link_correlation_widgets(ele, cds_p_correlation): pval_spinner=ele["correlation"]["wid"]["pval_spinner"], cds_p_correlation=cds_p_correlation), code=''' + console.log("filter_callback"); const indices = []; for (var i = 0; i < cds_p_correlation.data["index"].length; i++) { if (cds_p_correlation.data["pval_corr"][i] > pval_spinner.value) continue; diff --git a/grimer/cds.py b/grimer/cds.py index be90b1f..d22b7a7 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -109,19 +109,14 @@ def generate_cds_obstable(table, tax, contaminants, references, controls, contro freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0] df_rank["col|" + desc] = table.observations(rank).map(freq_perc_control).fillna(0).to_list() - # # Add col for each rank with parent taxid if exists, linking entries in their lineage for filtering and plotting - # Add col for each rank with parent taxid if exists, linking entries in their lineage for filtering and plotting - for other_rank in table.ranks(): - if table.ranks().index(other_rank) > table.ranks().index(rank): df_rank["tax|" + other_rank] = "" elif other_rank != rank: df_rank["tax|" + other_rank] = table.observations(rank).map(lambda txid: table.get_lineage(txid, rank, other_rank)).fillna("") else: df_rank["tax|" + other_rank] = df_rank.index - # Sort values by frequency to show on table df_rank.sort_values(by="col|frequency_perc", ascending=False, inplace=True) @@ -407,11 +402,19 @@ def generate_cds_correlation(table, top_obs_corr): top_taxids = sorted(table.observations(rank)) matrix = table.data[rank] + matrix.to_csv(rank + "_top.tsv", sep="\t", header=True, index=True) + # No correlation with just one observation if len(matrix.columns) >= 2: rho, pval = stats.spearmanr(matrix) + #from grimer.prop import get_prop_matrix, rho + #from skbio.stats.composition import clr + + #rho = get_prop_matrix(transform_table(matrix, 0, "", 0.000001).values, rho, clr) + #print(rho) + if len(matrix.columns) == 2: # If there are only 2 observations, return in a float # re-format in a matrix shape diff --git a/grimer/grimer.py b/grimer/grimer.py index a5032e2..7aaaa83 
100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -38,7 +38,7 @@ def main(): parser.add_argument('-m', '--metadata', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. QIIME 2 metadata format is also accepted, with categorical and numerical fields.") parser.add_argument('-t', '--tax', type=str, default=None, help="Define taxonomy to use. By default, do not use any taxonomy.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"]) parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=None, help="Taxonomy files. If not provided, will automatically be downloaded.") - + parser.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000") parser.add_argument('-r', '--ranks', nargs="*", default=[default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + default_rank_name + "' to use entries from the table directly. Default: " + default_rank_name) parser.add_argument('-l', '--title', type=str, default="", help="Title to display on the header of the report.") parser.add_argument('-o', '--output-html', type=str, default="output.html", help="File to output report. Default: output.html") @@ -62,7 +62,6 @@ def main(): heatmap_group = parser.add_argument_group('Heatmap and clustering options') heatmap_group.add_argument('-a', '--transformation', type=str, default="log", help="none (counts), norm (percentage), log (log10), clr (centre log ratio). Default: log") - heatmap_group.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000") heatmap_group.add_argument('-e', '--metadata-cols', type=int, default=5, help="How many metadata cols to show on the heatmap. Higher values make the plot slower to navigate.") heatmap_group.add_argument('--optimal-ordering', default=False, action='store_true', help="Activate optimal_ordering on linkage, takes longer for a large number of samples.") heatmap_group.add_argument('--show-zeros', default=False, action='store_true', help="Do not skip zeros on heatmap. 
File will be bigger and interaction with heatmap slower.") @@ -121,7 +120,6 @@ def main(): args.transpose = True table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose, args.min_frequency, args.max_frequency, args.min_count, args.max_count) - if args.level_separator: ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) else: @@ -133,19 +131,18 @@ def main(): table = Table(table_df.index, total, unassigned) table.lineage = lineage + print_log("Samples: " + str(len(table.samples))) + print_log("Observations: ") for r, t in ranked_tables.items(): + print_log(" " + r + ":") if t.empty: - print_log("Skipping rank without valid entries (" + r + ")") + print_log("Skipping without valid entries") else: - table.add_rank(r, t) + # Trim rows/cols containing only zeros from the table + table.add_rank(r, trim_table(t)) + print_log(" " + str(len(table.observations(r))) + " observations") print_log("") - print_log("Samples: " + str(len(table.samples))) - print_log("Observations: ") - - for rank in table.ranks(): - print_log(" - " + rank + ": " + str(len(table.observations(rank)))) - print_log("Total assigned (sum): " + str(table.total.sum())) print_log("Total unassigned (sum): " + str(table.unassigned.sum())) print_log("") @@ -402,8 +399,15 @@ def main(): os.path.join(script_dir, "js", "popup.js"): "script", os.path.join(script_dir, "css", "popup.css"): "style"}) + if args.full_offline: + mode = "inline" # configure to provide entire Bokeh JS and CSS inline + elif _debug: + mode = "absolute-dev" # non-minimized - configure to load from the installed Bokeh library static directory + else: + mode = "cdn" # configure to load Bokeh JS and CSS from https://cdn.bokeh.org + # setup output file and JS mode - output_file(args.output_html, title="GRIMER" if not args.title else "GRIMER - " + args.title, mode="inline" if args.full_offline else "cdn") + output_file(args.output_html, title="GRIMER" if not args.title else "GRIMER - " + args.title, mode=mode) save(final_layout, template=template) print_log("File: " + args.output_html) file_size_bytes = os.path.getsize(args.output_html) diff --git a/grimer/plots.py b/grimer/plots.py index 9a53835..079d17e 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -278,6 +278,7 @@ def plot_obstable(cds_p_obstable, ranks, contaminant_names, control_names): for rank in ranks: rank_filter = GroupFilter(column_name='col|rank', group=rank) cds_view = CDSView(source=cds_p_obstable, filters=[rank_filter, widgets_filter]) + table_cols = [] table_cols.append(TableColumn(field="col|name", title="Name")) table_cols.append(TableColumn(field="col|frequency_perc", title="Frequency", default_sort="descending", formatter=NumberFormatter(format="0.00%"))) @@ -320,7 +321,7 @@ def plot_obstable_widgets(dict_d_taxname, max_count_rank): # Create unique list of names with taxids for filtering. map to str and set to get unique unique_dict_d_taxname_tuples = set(zip(dict_d_taxname.keys(), map(str, dict_d_taxname.values()))) name_multichoice = MultiChoice(title="Obs. name or identifier", options=list(unique_dict_d_taxname_tuples), sizing_mode="fixed", width=250, height=100) - + help_text = """ Summary of observations among all samples. If taxonomy is provided, panels will show different taxonomic ranks. 
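The tooltip change in [PATCH 01/50] above stores each wedge's size in radians in the "angle" column and converts it back to a percentage of the full circle at hover time (angle / 2*pi * 100). A minimal standalone sketch of that Bokeh pattern, with hypothetical sample data (only the column names lineage, count and angle follow the patch):

from math import pi
import pandas as pd
from bokeh.models import ColumnDataSource, CustomJSHover, HoverTool
from bokeh.plotting import figure
from bokeh.transform import cumsum

biomes = pd.DataFrame({"lineage": ["root:Host-associated", "root:Environmental"], "count": [30, 10]})
biomes["angle"] = biomes["count"] / biomes["count"].sum() * 2 * pi  # wedge size in radians

fig = figure(height=150, width=300, tools="save,wheel_zoom,reset")
fig.wedge(x=0, y=1, radius=0.5,
          start_angle=cumsum("angle", include_zero=True), end_angle=cumsum("angle"),
          source=ColumnDataSource(biomes))
fig.add_tools(HoverTool(
    tooltips=[("Biome", "@lineage"), ("Studies", "@count"), ("Percentage", "@angle{custom}%")],
    mode="mouse", point_policy="follow_mouse",
    # convert the stored angle back to a percentage of the full circle (2*pi radians)
    formatters={"@angle": CustomJSHover(code="return ((value/6.2831853071795)*100).toFixed(2);")}))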
diff --git a/grimer/prop.py b/grimer/prop.py new file mode 100644 index 0000000..0472bc1 --- /dev/null +++ b/grimer/prop.py @@ -0,0 +1,79 @@ +import numpy as np + + +def vlr(x, y): + return np.var(x - y, ddof=1) # - 2 * np.cov(x, y, ddof=1)[1, 0] + np.var(x, ddof=1) + np.var(y, ddof=1) + + +def phi(x, y): + return vlr(x, y) / np.var(y, ddof=1) + + +def rho(x,y): + return 1 - (vlr(x, y) / (np.var(x, ddof=1) + np.var(y, ddof=1))) + + +def phs(x, y): + r = rho(x, y) + return (1 - r) / (1 + r) + + +def get_prop_matrix(mat, func): + r, c = mat.shape + corr_mat = np.zeros((c, c)) + for i in range(c): + for j in range(c): + corr_mat[i, j] = func(mat[:, i], mat[:, j]) + return corr_mat + + +def pairwise_vlr(mat): + cov = np.cov(mat.T, ddof=1) + diagonal = np.diagonal(cov) + return -2 * cov + diagonal[:, np.newaxis] + diagonal + + +def pairwise_phi(mat): + return pairwise_vlr(mat) / np.var(mat, axis=0, ddof=1) + + +def pairwise_rho(mat): + variances = np.var(mat, axis=0, ddof=1) + return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) + + +def pairwise_phs(mat): + r = pairwise_rho(mat) + return (1 - r) / (1 + r) + + +# from skbio.stats.composition import clr +# counts = np.array([[12,2,3,4],[5,6,7,8],[9,11,12,13]])# print(get_prop_matrix(counts, vlr, np.log)) +# counts = clr(counts) + +# print(get_prop_matrix(counts, vlr)) +# print(get_prop_matrix(counts, phi)) +# print(get_prop_matrix(counts, phs)) +# print(get_prop_matrix(counts, rho)) + +# print(pairwise_vlr(counts)) +# print(pairwise_phi(counts)) +# print(pairwise_phs(counts)) +# print(pairwise_rho(counts)) + +# # compare to each other +# print(np.isclose(get_prop_matrix(counts, vlr), pairwise_vlr(counts)).all()) +# print(np.isclose(get_prop_matrix(counts, phi), pairwise_phi(counts)).all()) +# print(np.isclose(get_prop_matrix(counts, phs), pairwise_phs(counts)).all()) +# print(np.isclose(get_prop_matrix(counts, rho), pairwise_rho(counts)).all()) + +# # compare to propr +# from rpy2.robjects.packages import importr +# from rpy2.robjects import numpy2ri +# propr = importr("propr") +# numpy2ri.activate() + +# print(np.isclose(propr.lr2vlr(counts), pairwise_vlr(counts)).all()) +# print(np.isclose(propr.lr2phi(counts), pairwise_phi(counts)).all()) +# print(np.isclose(propr.lr2phs(counts), pairwise_phs(counts)).all()) +# print(np.isclose(propr.lr2rho(counts), pairwise_rho(counts)).all()) diff --git a/grimer/utils.py b/grimer/utils.py index 533bd3f..a064184 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -60,7 +60,7 @@ def parse_input_table(input_file, unassigned_header, transpose, min_frequency, m print_log("") print_log("- Filtering table") - table_df = filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count) + table_df = trim_table(filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count)) # Filter based on the table unassigned = unassigned.reindex(table_df.index) @@ -109,6 +109,10 @@ def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, table_df = table_df.loc[:, table_df_freq <= max_frequency] print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --max-frequency " + str(max_frequency)) + return table_df + + +def trim_table(table_df): # Check for cols/rows with sum zero zero_rows = table_df.sum(axis=1) == 0 if any(zero_rows): @@ -163,9 +167,9 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): else: updated_nodes = update_tax_nodes(rank_nodes, tax) - # Add nan to convert empty ranks - 
updated_nodes[np.nan] = tax.undefined_node - ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] else t) + # Add nan to keep missing ranks (different from tax.undefined_node [None], which will keep the name) + updated_nodes[np.nan] = np.nan + ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] is not None else t) del updated_nodes[np.nan] unmatched_nodes += list(updated_nodes.values()).count(tax.undefined_node) @@ -182,7 +186,7 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) # Set to NaN to keep shape of ranks_df ranks_df.loc[ranks_df[r].isin(invalid), r] = np.nan - + ranked_tables = {} for i, r in parsed_ranks.items(): # ranks_df and table_df.T have the same shape @@ -265,6 +269,11 @@ def fdrcorrection_bh(pvals): def transform_table(df, total_counts, transformation, replace_zero_value): + # Special case: clr with a single observation (results in zeros) + if transformation == "clr" and df.shape[1] == 1: + print_log("WARNING: using log instead of clr with one observation") + transformation = "log" + if transformation == "log": transformed_df = (df + replace_zero_value).apply(np.log10) elif transformation == "clr": @@ -412,6 +421,7 @@ def run_hclustering(table, linkage_methods, linkage_metrics, transformation, rep dendro = {} for rank in table.ranks(): + # Get .values of the transformed table (numpy array) matrix = transform_table(table.data[rank], table.total, transformation, replace_zero_value).values From f21dc0bf64f6114f6875d16c98f574f41a275ef8 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 24 Sep 2021 16:19:56 +0200 Subject: [PATCH 03/50] new source class, new plot contaminants --- env.yaml | 4 +- grimer/callbacks.py | 32 +- grimer/cds.py | 29 +- grimer/composition.py | 1639 +++++++++++++++++++++++++++++++++++++++++ grimer/grimer.py | 7 + grimer/layout.py | 1 + grimer/plots.py | 34 + grimer/source.py | 89 ++- grimer/sourceold.py | 77 ++ grimer/utils.py | 30 +- 10 files changed, 1875 insertions(+), 67 deletions(-) create mode 100644 grimer/composition.py create mode 100644 grimer/sourceold.py diff --git a/env.yaml b/env.yaml index 1bdbfaa..ad3dca1 100644 --- a/env.yaml +++ b/env.yaml @@ -15,4 +15,6 @@ dependencies: - bioconductor-decontam==1.10.0 #DECONTAM - r-optparse==1.6.6 #DECONTAM - biom-format>=2.1.10 #biom - - jsonapi-client>=0.9.7 #mgnify scripts \ No newline at end of file + - jsonapi-client>=0.9.7 #mgnify scripts +# - r-propr #propr +# - rpy2 #propr \ No newline at end of file diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 7cfa129..d36eb53 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -10,6 +10,7 @@ def link_obstable_samplebars(ele, cds_p_decontam, cds_p_decontam_models, cds_d_decontam, + cds_p_contaminants, active_ranks, min_obs_perc, max_total_count, @@ -316,7 +317,36 @@ def link_obstable_samplebars(ele, cds_p_mgnify.change.emit(); ''') + contaminants_callback = CustomJS( + args=dict(contaminants_fig=ele["contaminants"]["fig"], + contaminants_filter=ele["contaminants"]["filter"], + cds_p_obstable=cds_p_obstable, + cds_p_contaminants=cds_p_contaminants, + active_ranks=active_ranks), + code=''' + console.log("contaminants_callback"); + // selected row + const row = cds_p_obstable.selected.indices[0]; + const indices = []; + if (row!=undefined){ + for(let i = 0; i < 
cds_p_contaminants.get_length(); i++){ + // for each rank + for(let r = 0; r < active_ranks.length; r++){ + // get taxid of the rank + let rank_obs = cds_p_obstable.data["tax|"+active_ranks[r]][row]; + if(cds_p_contaminants.data["obs"][i]==rank_obs && + cds_p_contaminants.data["cont"][i]=="CC Bacteria"){ + indices.push(i); + } + } + } + } + console.log(indices); + contaminants_filter.indices = indices; + cds_p_contaminants.change.emit(); + ''') + + obstable_callbacks = [plot_obs_callback, change_text_legend_obs_callback, sort_groupby_callback, load_infopanel, contaminants_callback] if cds_p_decontam: obstable_callbacks.append(decontam_callback) if cds_p_mgnify: diff --git a/grimer/cds.py b/grimer/cds.py index d22b7a7..91f2d13 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -21,6 +21,23 @@ def generate_dict_taxname(tax, taxids): return id_name +def generate_cds_plot_contaminants(table, tax, contaminants): + + clist = [] + for rank in table.ranks(): + for obs in table.observations(rank): + for desc, cont in contaminants.items(): + direct = cont.get_refs_count(obs, direct=True) + child = cont.get_refs_count(obs, children=True) + parent = cont.get_refs_count(obs, parents=True) + if direct + child + parent > 0: + clist.append([obs, rank, desc, direct, child, parent]) + + df_contaminants = pd.DataFrame(clist, columns=["obs", "rank", "cont", "direct", "child", "parent"]) + + print_df(df_contaminants, "df_contaminants -> cds_p_contaminants") + return ColumnDataSource(df_contaminants) + def generate_cds_annotations(table, contaminants, references, controls, decontam): # Stacked matrix of true annotations (omit false) # index -> taxids df_rank["decontam"] = decontam.get_contaminants(rank, df_rank.index) for desc, ref in references.items(): - df_rank[desc] = table.observations(rank).isin(ref.lineage) + df_rank[desc] = table.observations(rank).isin(ref.lineage()) for desc, cont in contaminants.items(): - df_rank[desc] = table.observations(rank).isin(cont.lineage) + df_rank[desc] = table.observations(rank).isin(cont.lineage()) if controls: for desc, ctrl in controls.items(): @@ -90,7 +107,7 @@ def generate_cds_obstable(table, tax, contaminants, references, controls, contro df_rank["col|references"] = "" for desc, ref in references.items(): # Check if taxids are in the lineage of the reference - bool_ref = table.observations(rank).isin(ref.lineage) + bool_ref = table.observations(rank).isin(ref.ids) df_rank["col|references"] = df_rank["col|references"] + np.where(bool_ref, desc + " | ", "") # Add a column for each Contaminant source + #print(table.observations(rank).isin(cont.ids)) + #print(table.observations(rank).isin(cont.lineage)) - df_rank["col|" + desc] = table.observations(rank).map(cont.get_refs_count).to_list() + df_rank["col|" + desc] = table.observations(rank).map(lambda x: cont.get_refs_count(x, direct=True)).to_list() + #df_rank["col|" + desc] = table.observations(rank).isin(cont.ids) # Add a column for each Control source @@ -175,7 +192,7 @@ def generate_cds_samples(table, references, contaminants, controls, decontam): for sources in source_list: for desc, src in sources: for rank in table.ranks(): - idx = table.observations(rank).isin(src.lineage) + idx = table.observations(rank).isin(src.lineage()) df_samples["cnt|" + rank + "|" + desc] = table.data[rank][table.observations(rank)[idx]].sum(axis=1) if 
decontam: @@ -371,7 +388,7 @@ def generate_dict_refs(table, contaminants, references): for i in used_ids: for source in [contaminants.items(), references.items()]: for sname, s in source: - for ref, descs in s.get_refs_desc(i).items(): + for ref, descs in s.get_refs_desc(i, direct=True).items(): for desc in descs: # Only add items if they have a reference to it if i not in d_refs: diff --git a/grimer/composition.py b/grimer/composition.py new file mode 100644 index 0000000..95a466b --- /dev/null +++ b/grimer/composition.py @@ -0,0 +1,1639 @@ +r""" +Composition Statistics (:mod:`skbio.stats.composition`) +======================================================= + +.. currentmodule:: skbio.stats.composition + +This module provides functions for compositional data analysis. + +Many 'omics datasets are inherently compositional - meaning that they +are best interpreted as proportions or percentages rather than +absolute counts. + +Formally, :math:`x` is a composition if :math:`\sum_{i=0}^D x_{i} = c` +and :math:`x_{i} > 0`, :math:`1 \leq i \leq D` and :math:`c` is a real +valued constant and there are :math:`D` components for each +composition. In this module :math:`c=1`. Compositional data can be +analyzed using Aitchison geometry. [1]_ + +However, in this framework, standard real Euclidean operations such as +addition and multiplication no longer apply. Only operations such as +perturbation and power can be used to manipulate this data. + +This module allows two styles of manipulation of compositional data. +Compositional data can be analyzed using perturbation and power +operations, which can be useful for simulation studies. The +alternative strategy is to transform compositional data into the real +space. Right now, the centre log ratio transform (clr) and +the isometric log ratio transform (ilr) [2]_ can be used to accomplish +this. This transform can be useful for performing standard statistical +tools such as parametric hypothesis testing, regressions and more. + +The major caveat of using this framework is dealing with zeros. In +the Aitchison geometry, only compositions with nonzero components can +be considered. The multiplicative replacement technique [3]_ can be +used to substitute these zeros with small pseudocounts without +introducing major distortions to the data. + +Functions +--------- + +.. autosummary:: + :toctree: + + closure + multiplicative_replacement + perturb + perturb_inv + power + inner + clr + clr_inv + ilr + ilr_inv + alr + alr_inv + centralize + ancom + sbp_basis + +References +---------- +.. [1] V. Pawlowsky-Glahn, J. J. Egozcue, R. Tolosana-Delgado (2015), + Modeling and Analysis of Compositional Data, Wiley, Chichester, UK + +.. [2] J. J. Egozcue., "Isometric Logratio Transformations for + Compositional Data Analysis" Mathematical Geology, 35.3 (2003) + +.. [3] J. A. Martin-Fernandez, "Dealing With Zeros and Missing Values in + Compositional Data Sets Using Nonparametric Imputation", + Mathematical Geology, 35.3 (2003) + + +Examples +-------- + +>>> import numpy as np + +Consider a very simple environment with only 3 species. The species +in the environment are equally distributed and their proportions are +equivalent: + +>>> otus = np.array([1./3, 1./3., 1./3]) + +Suppose that an antibiotic kills off half of the population for the +first two species, but doesn't harm the third species. 
Then the +perturbation vector would be as follows + +>>> antibiotic = np.array([1./2, 1./2, 1]) + +And the resulting perturbation would be + +>>> perturb(otus, antibiotic) +array([ 0.25, 0.25, 0.5 ]) + +""" + +# ---------------------------------------------------------------------------- +# Copyright (c) 2013--, scikit-bio development team. +# +# Distributed under the terms of the Modified BSD License. +# +# The full license is in the file COPYING.txt, distributed with this software. +# ---------------------------------------------------------------------------- +import numpy as np +import pandas as pd +import scipy.stats +import skbio.util +from skbio.util._decorator import experimental +from skbio.stats.distance import DistanceMatrix + + +@experimental(as_of="0.4.0") +def closure(mat): + """ + Performs closure to ensure that all elements add up to 1. + + Parameters + ---------- + mat : array_like + a matrix of proportions where + rows = compositions + columns = components + + Returns + ------- + array_like, np.float64 + A matrix of proportions where all of the values + are nonzero and each composition (row) adds up to 1 + + Raises + ------ + ValueError + Raises an error if any values are negative. + ValueError + Raises an error if the matrix has more than 2 dimension. + ValueError + Raises an error if there is a row that has all zeros. + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import closure + >>> X = np.array([[2, 2, 6], [4, 4, 2]]) + >>> closure(X) + array([[ 0.2, 0.2, 0.6], + [ 0.4, 0.4, 0.2]]) + + """ + mat = np.atleast_2d(mat) + if np.any(mat < 0): + raise ValueError("Cannot have negative proportions") + if mat.ndim > 2: + raise ValueError("Input matrix can only have two dimensions or less") + if np.all(mat == 0, axis=1).sum() > 0: + raise ValueError("Input matrix cannot have rows with all zeros") + mat = mat / mat.sum(axis=1, keepdims=True) + return mat.squeeze() + + +@experimental(as_of="0.4.0") +def multiplicative_replacement(mat, delta=None): + r"""Replace all zeros with small non-zero values + + It uses the multiplicative replacement strategy [1]_ , + replacing zeros with a small positive :math:`\delta` + and ensuring that the compositions still add up to 1. + + + Parameters + ---------- + mat: array_like + a matrix of proportions where + rows = compositions and + columns = components + delta: float, optional + a small number to be used to replace zeros + If delta is not specified, then the default delta is + :math:`\delta = \frac{1}{N^2}` where :math:`N` + is the number of components + + Returns + ------- + numpy.ndarray, np.float64 + A matrix of proportions where all of the values + are nonzero and each composition (row) adds up to 1 + + Raises + ------ + ValueError + Raises an error if negative proportions are created due to a large + `delta`. + + Notes + ----- + This method will result in negative proportions if a large delta is chosen. + + References + ---------- + .. [1] J. A. Martin-Fernandez. "Dealing With Zeros and Missing Values in + Compositional Data Sets Using Nonparametric Imputation" + + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import multiplicative_replacement + >>> X = np.array([[.2,.4,.4, 0],[0,.5,.5,0]]) + >>> multiplicative_replacement(X) + array([[ 0.1875, 0.375 , 0.375 , 0.0625], + [ 0.0625, 0.4375, 0.4375, 0.0625]]) + + """ + mat = closure(mat) + z_mat = (mat == 0) + + num_feats = mat.shape[-1] + tot = z_mat.sum(axis=-1, keepdims=True) + + if delta is None: + delta = (1. 
/ num_feats)**2 + + zcnts = 1 - tot * delta + if np.any(zcnts) < 0: + raise ValueError('The multiplicative replacement created negative ' + 'proportions. Consider using a smaller `delta`.') + mat = np.where(z_mat, delta, zcnts * mat) + return mat.squeeze() + + +@experimental(as_of="0.4.0") +def perturb(x, y): + r""" + Performs the perturbation operation. + + This operation is defined as + + .. math:: + x \oplus y = C[x_1 y_1, \ldots, x_D y_D] + + :math:`C[x]` is the closure operation defined as + + .. math:: + C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, + \frac{x_D}{\sum_{i=1}^{D} x_i} \right] + + for some :math:`D` dimensional real vector :math:`x` and + :math:`D` is the number of components for every composition. + + Parameters + ---------- + x : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + y : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + + Returns + ------- + numpy.ndarray, np.float64 + A matrix of proportions where all of the values + are nonzero and each composition (row) adds up to 1 + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import perturb + >>> x = np.array([.1,.3,.4, .2]) + >>> y = np.array([1./6,1./6,1./3,1./3]) + >>> perturb(x,y) + array([ 0.0625, 0.1875, 0.5 , 0.25 ]) + + """ + x, y = closure(x), closure(y) + return closure(x * y) + + +@experimental(as_of="0.4.0") +def perturb_inv(x, y): + r""" + Performs the inverse perturbation operation. + + This operation is defined as + + .. math:: + x \ominus y = C[x_1 y_1^{-1}, \ldots, x_D y_D^{-1}] + + :math:`C[x]` is the closure operation defined as + + .. math:: + C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, + \frac{x_D}{\sum_{i=1}^{D} x_i} \right] + + + for some :math:`D` dimensional real vector :math:`x` and + :math:`D` is the number of components for every composition. + + Parameters + ---------- + x : array_like + a matrix of proportions where + rows = compositions and + columns = components + y : array_like + a matrix of proportions where + rows = compositions and + columns = components + + Returns + ------- + numpy.ndarray, np.float64 + A matrix of proportions where all of the values + are nonzero and each composition (row) adds up to 1 + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import perturb_inv + >>> x = np.array([.1,.3,.4, .2]) + >>> y = np.array([1./6,1./6,1./3,1./3]) + >>> perturb_inv(x,y) + array([ 0.14285714, 0.42857143, 0.28571429, 0.14285714]) + """ + x, y = closure(x), closure(y) + return closure(x / y) + + +@experimental(as_of="0.4.0") +def power(x, a): + r""" + Performs the power operation. + + This operation is defined as follows + + .. math:: + `x \odot a = C[x_1^a, \ldots, x_D^a] + + :math:`C[x]` is the closure operation defined as + + .. math:: + C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, + \frac{x_D}{\sum_{i=1}^{D} x_i} \right] + + for some :math:`D` dimensional real vector :math:`x` and + :math:`D` is the number of components for every composition. 
+ + Parameters + ---------- + x : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + a : float + a scalar float + + Returns + ------- + numpy.ndarray, np.float64 + A matrix of proportions where all of the values + are nonzero and each composition (row) adds up to 1 + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import power + >>> x = np.array([.1,.3,.4, .2]) + >>> power(x, .1) + array([ 0.23059566, 0.25737316, 0.26488486, 0.24714631]) + + """ + x = closure(x) + return closure(x**a).squeeze() + + +@experimental(as_of="0.4.0") +def inner(x, y): + r""" + Calculates the Aitchson inner product. + + This inner product is defined as follows + + .. math:: + \langle x, y \rangle_a = + \frac{1}{2D} \sum\limits_{i=1}^{D} \sum\limits_{j=1}^{D} + \ln\left(\frac{x_i}{x_j}\right) \ln\left(\frac{y_i}{y_j}\right) + + Parameters + ---------- + x : array_like + a matrix of proportions where + rows = compositions and + columns = components + y : array_like + a matrix of proportions where + rows = compositions and + columns = components + + Returns + ------- + numpy.ndarray + inner product result + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import inner + >>> x = np.array([.1, .3, .4, .2]) + >>> y = np.array([.2, .4, .2, .2]) + >>> inner(x, y) # doctest: +ELLIPSIS + 0.2107852473... + """ + x = closure(x) + y = closure(y) + a, b = clr(x), clr(y) + return a.dot(b.T) + + +@experimental(as_of="0.4.0") +def clr(mat): + r""" + Performs centre log ratio transformation. + + This function transforms compositions from Aitchison geometry to + the real space. The :math:`clr` transform is both an isometry and an + isomorphism defined on the following spaces + + :math:`clr: S^D \rightarrow U` + + where :math:`U= + \{x :\sum\limits_{i=1}^D x = 0 \; \forall x \in \mathbb{R}^D\}` + + It is defined for a composition :math:`x` as follows: + + .. math:: + clr(x) = \ln\left[\frac{x_1}{g_m(x)}, \ldots, \frac{x_D}{g_m(x)}\right] + + where :math:`g_m(x) = (\prod\limits_{i=1}^{D} x_i)^{1/D}` is the geometric + mean of :math:`x`. + + Parameters + ---------- + mat : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + + Returns + ------- + numpy.ndarray + clr transformed matrix + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import clr + >>> x = np.array([.1, .3, .4, .2]) + >>> clr(x) + array([-0.79451346, 0.30409883, 0.5917809 , -0.10136628]) + + """ + mat = closure(mat) + lmat = np.log(mat) + gm = lmat.mean(axis=-1, keepdims=True) + return (lmat - gm).squeeze() + + +@experimental(as_of="0.4.0") +def clr_inv(mat): + r""" + Performs inverse centre log ratio transformation. + + This function transforms compositions from the real space to + Aitchison geometry. The :math:`clr^{-1}` transform is both an isometry, + and an isomorphism defined on the following spaces + + :math:`clr^{-1}: U \rightarrow S^D` + + where :math:`U= + \{x :\sum\limits_{i=1}^D x = 0 \; \forall x \in \mathbb{R}^D\}` + + This transformation is defined as follows + + .. 
math:: + clr^{-1}(x) = C[\exp( x_1, \ldots, x_D)] + + Parameters + ---------- + mat : array_like, float + a matrix of real values where + rows = transformed compositions and + columns = components + + Returns + ------- + numpy.ndarray + inverse clr transformed matrix + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import clr_inv + >>> x = np.array([.1, .3, .4, .2]) + >>> clr_inv(x) + array([ 0.21383822, 0.26118259, 0.28865141, 0.23632778]) + + """ + return closure(np.exp(mat)) + + +@experimental(as_of="0.4.0") +def ilr(mat, basis=None, check=True): + r""" + Performs isometric log ratio transformation. + + This function transforms compositions from Aitchison simplex to + the real space. The :math: ilr` transform is both an isometry, + and an isomorphism defined on the following spaces + + :math:`ilr: S^D \rightarrow \mathbb{R}^{D-1}` + + The ilr transformation is defined as follows + + .. math:: + ilr(x) = + [\langle x, e_1 \rangle_a, \ldots, \langle x, e_{D-1} \rangle_a] + + where :math:`[e_1,\ldots,e_{D-1}]` is an orthonormal basis in the simplex. + + If an orthornormal basis isn't specified, the J. J. Egozcue orthonormal + basis derived from Gram-Schmidt orthogonalization will be used by + default. + + Parameters + ---------- + mat: numpy.ndarray + a matrix of proportions where + rows = compositions and + columns = components + + basis: numpy.ndarray, float, optional + orthonormal basis for Aitchison simplex + defaults to J.J.Egozcue orthonormal basis. + + check: bool + Specifies if the basis is orthonormal. + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import ilr + >>> x = np.array([.1, .3, .4, .2]) + >>> ilr(x) + array([-0.7768362 , -0.68339802, 0.11704769]) + + Notes + ----- + If the `basis` parameter is specified, it is expected to be a basis in the + Aitchison simplex. If there are `D-1` elements specified in `mat`, then + the dimensions of the basis needs be `D-1 x D`, where rows represent + basis vectors, and the columns represent proportions. + """ + mat = closure(mat) + if basis is None: + basis = clr_inv(_gram_schmidt_basis(mat.shape[-1])) + else: + if len(basis.shape) != 2: + raise ValueError("Basis needs to be a 2D matrix, " + "not a %dD matrix." % + (len(basis.shape))) + if check: + _check_orthogonality(basis) + + return inner(mat, basis) + + +@experimental(as_of="0.4.0") +def ilr_inv(mat, basis=None, check=True): + r""" + Performs inverse isometric log ratio transform. + + This function transforms compositions from the real space to + Aitchison geometry. The :math:`ilr^{-1}` transform is both an isometry, + and an isomorphism defined on the following spaces + + :math:`ilr^{-1}: \mathbb{R}^{D-1} \rightarrow S^D` + + The inverse ilr transformation is defined as follows + + .. math:: + ilr^{-1}(x) = \bigoplus\limits_{i=1}^{D-1} x \odot e_i + + where :math:`[e_1,\ldots, e_{D-1}]` is an orthonormal basis in the simplex. + + If an orthonormal basis isn't specified, the J. J. Egozcue orthonormal + basis derived from Gram-Schmidt orthogonalization will be used by + default. + + + Parameters + ---------- + mat: numpy.ndarray, float + a matrix of transformed proportions where + rows = compositions and + columns = components + + basis: numpy.ndarray, float, optional + orthonormal basis for Aitchison simplex + defaults to J.J.Egozcue orthonormal basis + + check: bool + Specifies if the basis is orthonormal. 
+ + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import ilr + >>> x = np.array([.1, .3, .6,]) + >>> ilr_inv(x) + array([ 0.34180297, 0.29672718, 0.22054469, 0.14092516]) + + Notes + ----- + If the `basis` parameter is specified, it is expected to be a basis in the + Aitchison simplex. If there are `D-1` elements specified in `mat`, then + the dimensions of the basis needs be `D-1 x D`, where rows represent + basis vectors, and the columns represent proportions. + """ + + if basis is None: + basis = _gram_schmidt_basis(mat.shape[-1] + 1) + else: + if len(basis.shape) != 2: + raise ValueError("Basis needs to be a 2D matrix, " + "not a %dD matrix." % + (len(basis.shape))) + if check: + _check_orthogonality(basis) + # this is necessary, since the clr function + # performs np.squeeze() + basis = np.atleast_2d(clr(basis)) + + return clr_inv(np.dot(mat, basis)) + + +@experimental(as_of="0.5.5") +def alr(mat, denominator_idx=0): + r""" + Performs additive log ratio transformation. + + This function transforms compositions from a D-part Aitchison simplex to + a non-isometric real space of D-1 dimensions. The argument + `denominator_col` defines the index of the column used as the common + denominator. The :math: `alr` transformed data are amenable to multivariate + analysis as long as statistics don't involve distances. + + :math:`alr: S^D \rightarrow \mathbb{R}^{D-1}` + + The alr transformation is defined as follows + + .. math:: + alr(x) = \left[ \ln \frac{x_1}{x_D}, \ldots, + \ln \frac{x_{D-1}}{x_D} \right] + + where :math:`D` is the index of the part used as common denominator. + + Parameters + ---------- + mat: numpy.ndarray + a matrix of proportions where + rows = compositions and + columns = components + + denominator_idx: int + the index of the column (2D-matrix) or position (vector) of + `mat` which should be used as the reference composition. By default + `denominator_idx=0` to specify the first column or position. + + Returns + ------- + numpy.ndarray + alr-transformed data projected in a non-isometric real space + of D-1 dimensions for a D-parts composition + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import alr + >>> x = np.array([.1, .3, .4, .2]) + >>> alr(x) + array([ 1.09861229, 1.38629436, 0.69314718]) + """ + mat = closure(mat) + if mat.ndim == 2: + mat_t = mat.T + numerator_idx = list(range(0, mat_t.shape[0])) + del numerator_idx[denominator_idx] + lr = np.log(mat_t[numerator_idx, :]/mat_t[denominator_idx, :]).T + elif mat.ndim == 1: + numerator_idx = list(range(0, mat.shape[0])) + del numerator_idx[denominator_idx] + lr = np.log(mat[numerator_idx]/mat[denominator_idx]) + else: + raise ValueError("mat must be either 1D or 2D") + return lr + + +@experimental(as_of="0.5.5") +def alr_inv(mat, denominator_idx=0): + r""" + Performs inverse additive log ratio transform. + + This function transforms compositions from the non-isometric real space of + alrs to Aitchison geometry. + + :math:`alr^{-1}: \mathbb{R}^{D-1} \rightarrow S^D` + + The inverse alr transformation is defined as follows + + .. math:: + alr^{-1}(x) = C[exp([y_1, y_2, ..., y_{D-1}, 0])] + + where :math:`C[x]` is the closure operation defined as + + .. math:: + C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, + \frac{x_D}{\sum_{i=1}^{D} x_i} \right] + + for some :math:`D` dimensional real vector :math:`x` and + :math:`D` is the number of components for every composition. 
+ + Parameters + ---------- + mat: numpy.ndarray + a matrix of alr-transformed data + denominator_idx: int + the index of the column (2D-composition) or position (1D-composition) of + the output where the common denominator should be placed. By default + `denominator_idx=0` to specify the first column or position. + + Returns + ------- + numpy.ndarray + Inverse alr transformed matrix or vector where rows sum to 1. + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import alr, alr_inv + >>> x = np.array([.1, .3, .4, .2]) + >>> alr_inv(alr(x)) + array([ 0.1, 0.3, 0.4, 0.2]) + """ + mat = np.array(mat) + if mat.ndim == 2: + mat_idx = np.insert(mat, denominator_idx, + np.repeat(0, mat.shape[0]), axis=1) + comp = np.zeros(mat_idx.shape) + comp[:, denominator_idx] = 1 / (np.exp(mat).sum(axis=1) + 1) + numerator_idx = list(range(0, comp.shape[1])) + del numerator_idx[denominator_idx] + for i in numerator_idx: + comp[:, i] = comp[:, denominator_idx] * np.exp(mat_idx[:, i]) + elif mat.ndim == 1: + mat_idx = np.insert(mat, denominator_idx, 0, axis=0) + comp = np.zeros(mat_idx.shape) + comp[denominator_idx] = 1 / (np.exp(mat).sum(axis=0) + 1) + numerator_idx = list(range(0, comp.shape[0])) + del numerator_idx[denominator_idx] + for i in numerator_idx: + comp[i] = comp[denominator_idx] * np.exp(mat_idx[i]) + else: + raise ValueError("mat must be either 1D or 2D") + return comp + + +@experimental(as_of="0.4.0") +def centralize(mat): + r"""Center data around its geometric average. + + Parameters + ---------- + mat : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + + Returns + ------- + numpy.ndarray + centered composition matrix + + Examples + -------- + >>> import numpy as np + >>> from skbio.stats.composition import centralize + >>> X = np.array([[.1,.3,.4, .2],[.2,.2,.2,.4]]) + >>> centralize(X) + array([[ 0.17445763, 0.30216948, 0.34891526, 0.17445763], + [ 0.32495488, 0.18761279, 0.16247744, 0.32495488]]) + + """ + mat = closure(mat) + cen = scipy.stats.gmean(mat, axis=0) + return perturb_inv(mat, cen) + + + +@experimental(as_of="0.5.7") +def _vlr(x:np.array, y:np.array, ddof:int): + r""" + Calculates variance log ratio + + Parameters + ---------- + x : array_like, float + a 1-dimensional vector of proportions + y : array_like, float + a 1-dimensional vector of proportions + + ddof: int + degrees of freedom + + + Returns + ------- + float + variance log ratio value + + + References + ---------- + .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) + Proportionality: A Valid Alternative to Correlation for Relative Data. + PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 + .. [2] Erb, I., Notredame, C. + How should we measure proportionality on relative gene expression data?. + Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 + """ + + # Log transformation + x = np.log(x) + y = np.log(y) + + #Variance log ratio + return np.var(x - y, ddof=ddof) + + +@experimental(as_of="0.5.7") +def _robust_vlr(x:np.array, y:np.array, ddof:int): + r""" + Calculates variance log ratio while masking zeros + + Parameters + ---------- + x : array_like, float + a 1-dimensional vector of proportions + y : array_like, float + a 1-dimensional vector of proportions + + ddof: int + degrees of freedom + + Returns + ------- + float + variance log ratio value + + + References + ---------- + .. [1] V. 
Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) + Proportionality: A Valid Alternative to Correlation for Relative Data. + PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 + .. [2] Erb, I., Notredame, C. + How should we measure proportionality on relative gene expression data?. + Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 + """ + + # Mask zeros + x = np.ma.masked_array(x, mask=x == 0) + y = np.ma.masked_array(y, mask=y == 0) + + # Log transformation + x = np.ma.log(x) + y = np.ma.log(y) + + #Variance log ratio + return np.ma.var(x - y, ddof=ddof) + +@experimental(as_of="0.5.7") +def vlr(x:np.array, y:np.array, ddof:int=1, robust:bool=False): + r""" + Calculates variance log ratio + + Parameters + ---------- + x : array_like, float + a 1-dimensional vector of proportions + y : array_like, float + a 1-dimensional vector of proportions + + ddof: int + degrees of freedom + + robust: bool + mask zeros at the cost of performance + + Returns + ------- + float + variance log ratio value + + Examples + -------- + No zeros + >>> x = [1,2,3] + >>> y = [5,8,13] + >>> %timeit vlr(x,y) + >>> 0.01277962183258352 + + Zeros without robust + >>> x = [1,2,3,0] + >>> y = [5,8,13,21] + >>> vlr(x,y) + >>> nan + + Zeros with robust + >>> x = [1,2,3,0] + >>> y = [5,8,13,21] + >>> vlr(x,y, robust=True) + >>> 0.01277962183258352 + + References + ---------- + .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) + Proportionality: A Valid Alternative to Correlation for Relative Data. + PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 + .. [2] Erb, I., Notredame, C. + How should we measure proportionality on relative gene expression data?. + Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 + """ + # Convert array_like to numpy array + x = np.asarray(x) + y = np.asarray(y) + + # Set up input and parameters + kwargs = { + "x":x, + "y":y, + "ddof":ddof, + } + + # Run backend function + if robust: + return _robust_vlr(**kwargs) + else: + return _vlr(**kwargs) + + +@experimental(as_of="0.5.7") +def _pairwise_vlr(mat:np.array, ddof:int): + r""" + Performs pairwise variance log ratio transformation + + Parameters + ---------- + mat : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + + ids: array_like, str + component names + + ddof: int + degrees of freedom + + Returns + ------- + skbio.DistanceMatrix + distance matrix of variance log ratio values + + + References + ---------- + .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) + Proportionality: A Valid Alternative to Correlation for Relative Data. + PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 + .. [2] Erb, I., Notredame, C. + How should we measure proportionality on relative gene expression data?. + Theory Biosci. 135, 21–36 (2016). 
https://doi.org/10.1007/s12064-015-0220-8 + """ + + # Log Transform + X_log = np.log(mat) + + # Variance Log Ratio + covariance = np.cov(X_log.T, ddof=ddof) + diagonal = np.diagonal(covariance) + vlr_data = -2*covariance + diagonal[:,np.newaxis] + diagonal + return vlr_data + +@experimental(as_of="0.5.7") +def _robust_pairwise_vlr(mat:np.array, ddof:int): + r""" + Performs pairwise variance log ratio transformation while masking zeros + + Parameters + ---------- + mat : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + + ids: array_like, str + component names + + ddof: int + degrees of freedom + + Returns + ------- + skbio.DistanceMatrix + distance matrix of variance log ratio values + + + References + ---------- + .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) + Proportionality: A Valid Alternative to Correlation for Relative Data. + PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 + .. [2] Erb, I., Notredame, C. + How should we measure proportionality on relative gene expression data?. + Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 + """ + # Mask zeros + X = np.ma.masked_array(mat, mask=mat == 0) + + # Log Transform + X_log = np.ma.log(X) + + # Variance Log Ratio + covariance = np.ma.cov(X_log.T, ddof=ddof) + diagonal = np.ma.diagonal(covariance) + vlr_data = -2*covariance + diagonal[:,np.newaxis] + diagonal + return vlr_data + +@experimental(as_of="0.5.7") +def pairwise_vlr(mat, ids=None, ddof=1, robust=False) -> DistanceMatrix: + r""" + Performs pairwise variance log ratio transformation + + Parameters + ---------- + mat : array_like, float + a matrix of proportions where + rows = compositions and + columns = components + + ids: array_like, str + component names + + ddof: int + degrees of freedom + + robust: bool + mask zeros at the cost of performance + + Returns + ------- + skbio.DistanceMatrix + distance matrix of variance log ratio values + + Examples + -------- + >>> mat = np.asarray([ + [1,2,3], + [5,8,13], + [21,34,55], + ]) + >>> dism = pairwise_vlr(mat) + >>> dism.redundant_form() + array([[0. , 0.01576411, 0.00649553], + [0.01576411, 0. , 0.00202147], + [0.00649553, 0.00202147, 0. ]]) + + + References + ---------- + .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) + Proportionality: A Valid Alternative to Correlation for Relative Data. + PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 + .. [2] Erb, I., Notredame, C. + How should we measure proportionality on relative gene expression data?. + Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 + """ + + # Mask zeros + mat = mat.astype(np.float64) + mat = closure(mat) + # Set up input and parameters + kwargs = { + "mat":mat, + "ddof":ddof, + } + + # Variance Log Ratio + if robust: + vlr_data = _robust_pairwise_vlr(**kwargs) + else: + vlr_data = _pairwise_vlr(**kwargs) + + # Force symmetry + vlr_data = (vlr_data + vlr_data.T)/2 + + # Create distance matrix + return DistanceMatrix(vlr_data, ids=ids) + + +@experimental(as_of="0.4.1") +def ancom(table, grouping, + alpha=0.05, + tau=0.02, + theta=0.1, + multiple_comparisons_correction='holm-bonferroni', + significance_test=None, + percentiles=(0.0, 25.0, 50.0, 75.0, 100.0)): + r""" Performs a differential abundance test using ANCOM. 
+ + This is done by calculating pairwise log ratios between all features + and performing a significance test to determine if there is a significant + difference in feature ratios with respect to the variable of interest. + + In an experiment with only two treatments, this tests the following + hypothesis for feature :math:`i` + + .. math:: + + H_{0i}: \mathbb{E}[\ln(u_i^{(1)})] = \mathbb{E}[\ln(u_i^{(2)})] + + where :math:`u_i^{(1)}` is the mean abundance for feature :math:`i` in the + first group and :math:`u_i^{(2)}` is the mean abundance for feature + :math:`i` in the second group. + + Parameters + ---------- + table : pd.DataFrame + A 2D matrix of strictly positive values (i.e. counts or proportions) + where the rows correspond to samples and the columns correspond to + features. + grouping : pd.Series + Vector indicating the assignment of samples to groups. For example, + these could be strings or integers denoting which group a sample + belongs to. It must be the same length as the samples in `table`. + The index must be the same on `table` and `grouping` but need not be + in the same order. + alpha : float, optional + Significance level for each of the statistical tests. + This can be anywhere between 0 and 1 exclusive. + tau : float, optional + A constant used to determine an appropriate cutoff. + A value close to zero indicates a conservative cutoff. + This can be anywhere between 0 and 1 exclusive. + theta : float, optional + Lower bound for the proportion for the W-statistic. + If all W-statistics are lower than theta, then no features + will be detected to be differentially significant. + This can be anywhere between 0 and 1 exclusive. + multiple_comparisons_correction : {None, 'holm-bonferroni'}, optional + The multiple comparison correction procedure to run. If None, + then no multiple comparison correction procedure will be run. + If 'holm-bonferroni' is specified, then the Holm-Bonferroni + procedure [1]_ will be run. + significance_test : function, optional + A statistical significance function to test for significance between + classes. This function must be able to accept at least two 1D + array_like arguments of floats and return a test statistic and a + p-value. By default ``scipy.stats.f_oneway`` is used. + percentiles : iterable of floats, optional + Percentile abundances to return for each feature in each group. By + default, will return the minimum, 25th percentile, median, 75th + percentile, and maximum abundances for each feature in each group. + + Returns + ------- + pd.DataFrame + A table of features, their W-statistics and whether the null hypothesis + is rejected. + + `"W"` is the W-statistic, or number of features that a single feature + is tested to be significantly different against. + + `"Reject null hypothesis"` indicates if feature is differentially + abundant across groups (`True`) or not (`False`). + + pd.DataFrame + A table of features and their percentile abundances in each group. If + ``percentiles`` is empty, this will be an empty ``pd.DataFrame``. The + rows in this object will be features, and the columns will be a + multi-index where the first index is the percentile, and the second + index is the group. 
+ + See Also + -------- + multiplicative_replacement + scipy.stats.ttest_ind + scipy.stats.f_oneway + scipy.stats.wilcoxon + scipy.stats.kruskal + + Notes + ----- + The developers of this method recommend the following significance tests + ([2]_, Supplementary File 1, top of page 11): if there are 2 groups, use + the standard parametric t-test (``scipy.stats.ttest_ind``) or + non-parametric Wilcoxon rank sum test (``scipy.stats.wilcoxon``). + If there are more than 2 groups, use parametric one-way ANOVA + (``scipy.stats.f_oneway``) or nonparametric Kruskal-Wallis + (``scipy.stats.kruskal``). Because one-way ANOVA is equivalent + to the standard t-test when the number of groups is two, we default to + ``scipy.stats.f_oneway`` here, which can be used when there are two or + more groups. Users should refer to the documentation of these tests in + SciPy to understand the assumptions made by each test. + + This method cannot handle any zero counts as input, since the logarithm + of zero cannot be computed. While this is an unsolved problem, many + studies, including [2]_, have shown promising results by adding + pseudocounts to all values in the matrix. In [2]_, a pseudocount of 0.001 + was used, though the authors note that a pseudocount of 1.0 may also be + useful. Zero counts can also be addressed using the + ``multiplicative_replacement`` method. + + References + ---------- + .. [1] Holm, S. "A simple sequentially rejective multiple test procedure". + Scandinavian Journal of Statistics (1979), 6. + .. [2] Mandal et al. "Analysis of composition of microbiomes: a novel + method for studying microbial composition", Microbial Ecology in Health + & Disease, (2015), 26. + + Examples + -------- + First import all of the necessary modules: + + >>> from skbio.stats.composition import ancom + >>> import pandas as pd + + Now let's load in a DataFrame with 6 samples and 7 features (e.g., + these may be bacterial OTUs): + + >>> table = pd.DataFrame([[12, 11, 10, 10, 10, 10, 10], + ... [9, 11, 12, 10, 10, 10, 10], + ... [1, 11, 10, 11, 10, 5, 9], + ... [22, 21, 9, 10, 10, 10, 10], + ... [20, 22, 10, 10, 13, 10, 10], + ... [23, 21, 14, 10, 10, 10, 10]], + ... index=['s1', 's2', 's3', 's4', 's5', 's6'], + ... columns=['b1', 'b2', 'b3', 'b4', 'b5', 'b6', + ... 'b7']) + + Then create a grouping vector. In this example, there is a treatment group + and a placebo group. + + >>> grouping = pd.Series(['treatment', 'treatment', 'treatment', + ... 'placebo', 'placebo', 'placebo'], + ... index=['s1', 's2', 's3', 's4', 's5', 's6']) + + Now run ``ancom`` to determine if there are any features that are + significantly different in abundance between the treatment and the placebo + groups. The first DataFrame that is returned contains the ANCOM test + results, and the second contains the percentile abundance data for each + feature in each group. + + >>> ancom_df, percentile_df = ancom(table, grouping) + >>> ancom_df['W'] + b1 0 + b2 4 + b3 0 + b4 1 + b5 1 + b6 0 + b7 1 + Name: W, dtype: int64 + + The W-statistic is the number of features that a single feature is tested + to be significantly different against. In this scenario, `b2` was detected + to have significantly different abundances compared to four of the other + features. To summarize the results from the W-statistic, let's take a look + at the results from the hypothesis test. 
The `Reject null hypothesis` + column in the table indicates whether the null hypothesis was rejected, + and that a feature was therefore observed to be differentially abundant + across the groups. + + >>> ancom_df['Reject null hypothesis'] + b1 False + b2 True + b3 False + b4 False + b5 False + b6 False + b7 False + Name: Reject null hypothesis, dtype: bool + + From this we can conclude that only `b2` was significantly different in + abundance between the treatment and the placebo. We still don't know, for + example, in which group `b2` was more abundant. We therefore may next be + interested in comparing the abundance of `b2` across the two groups. + We can do that using the second DataFrame that was returned. Here we + compare the median (50th percentile) abundance of `b2` in the treatment and + placebo groups: + + >>> percentile_df[50.0].loc['b2'] + Group + placebo 21.0 + treatment 11.0 + Name: b2, dtype: float64 + + We can also look at a full five-number summary for ``b2`` in the treatment + and placebo groups: + + >>> percentile_df.loc['b2'] # doctest: +NORMALIZE_WHITESPACE + Percentile Group + 0.0 placebo 21.0 + 25.0 placebo 21.0 + 50.0 placebo 21.0 + 75.0 placebo 21.5 + 100.0 placebo 22.0 + 0.0 treatment 11.0 + 25.0 treatment 11.0 + 50.0 treatment 11.0 + 75.0 treatment 11.0 + 100.0 treatment 11.0 + Name: b2, dtype: float64 + + Taken together, these data tell us that `b2` is present in significantly + higher abundance in the placebo group samples than in the treatment group + samples. + + """ + if not isinstance(table, pd.DataFrame): + raise TypeError('`table` must be a `pd.DataFrame`, ' + 'not %r.' % type(table).__name__) + if not isinstance(grouping, pd.Series): + raise TypeError('`grouping` must be a `pd.Series`,' + ' not %r.' % type(grouping).__name__) + + if np.any(table <= 0): + raise ValueError('Cannot handle zeros or negative values in `table`. ' + 'Use pseudocounts or ``multiplicative_replacement``.' + ) + + if not 0 < alpha < 1: + raise ValueError('`alpha`=%f is not within 0 and 1.' % alpha) + + if not 0 < tau < 1: + raise ValueError('`tau`=%f is not within 0 and 1.' % tau) + + if not 0 < theta < 1: + raise ValueError('`theta`=%f is not within 0 and 1.' % theta) + + if multiple_comparisons_correction is not None: + if multiple_comparisons_correction != 'holm-bonferroni': + raise ValueError('%r is not an available option for ' + '`multiple_comparisons_correction`.' + % multiple_comparisons_correction) + + if (grouping.isnull()).any(): + raise ValueError('Cannot handle missing values in `grouping`.') + + if (table.isnull()).any().any(): + raise ValueError('Cannot handle missing values in `table`.') + + percentiles = list(percentiles) + for percentile in percentiles: + if not 0.0 <= percentile <= 100.0: + raise ValueError('Percentiles must be in the range [0, 100], %r ' + 'was provided.' % percentile) + + duplicates = skbio.util.find_duplicates(percentiles) + if duplicates: + formatted_duplicates = ', '.join(repr(e) for e in duplicates) + raise ValueError('Percentile values must be unique. The following' + ' value(s) were duplicated: %s.' % + formatted_duplicates) + + groups = np.unique(grouping) + num_groups = len(groups) + + if num_groups == len(grouping): + raise ValueError( + "All values in `grouping` are unique. 
This method cannot "
+            "operate on a grouping vector with only unique values (e.g., "
+            "there is no 'within' variance because each group of samples "
+            "contains only a single sample).")
+
+    if num_groups == 1:
+        raise ValueError(
+            "All values in `grouping` are the same. This method cannot "
+            "operate on a grouping vector with only a single group of "
+            "samples (e.g., there is no 'between' variance because there "
+            "is only a single group).")
+
+    if significance_test is None:
+        significance_test = scipy.stats.f_oneway
+
+    table_index_len = len(table.index)
+    grouping_index_len = len(grouping.index)
+    mat, cats = table.align(grouping, axis=0, join='inner')
+    if (len(mat) != table_index_len or len(cats) != grouping_index_len):
+        raise ValueError('`table` index and `grouping` '
+                         'index must be consistent.')
+
+    n_feat = mat.shape[1]
+
+    _logratio_mat = _log_compare(mat.values, cats.values, significance_test)
+    logratio_mat = _logratio_mat + _logratio_mat.T
+
+    # Multiple comparisons
+    if multiple_comparisons_correction == 'holm-bonferroni':
+        logratio_mat = np.apply_along_axis(_holm_bonferroni,
+                                           1, logratio_mat)
+    np.fill_diagonal(logratio_mat, 1)
+    W = (logratio_mat < alpha).sum(axis=1)
+    c_start = W.max() / n_feat
+    if c_start < theta:
+        reject = np.zeros_like(W, dtype=bool)
+    else:
+        # Select appropriate cutoff
+        cutoff = c_start - np.linspace(0.05, 0.25, 5)
+        prop_cut = np.array([(W > n_feat*cut).mean() for cut in cutoff])
+        dels = np.abs(prop_cut - np.roll(prop_cut, -1))
+        dels[-1] = 0
+
+        if (dels[0] < tau) and (dels[1] < tau) and (dels[2] < tau):
+            nu = cutoff[1]
+        elif (dels[0] >= tau) and (dels[1] < tau) and (dels[2] < tau):
+            nu = cutoff[2]
+        elif (dels[1] >= tau) and (dels[2] < tau) and (dels[3] < tau):
+            nu = cutoff[3]
+        else:
+            nu = cutoff[4]
+        reject = (W >= nu*n_feat)
+
+    feat_ids = mat.columns
+    ancom_df = pd.DataFrame(
+        {'W': pd.Series(W, index=feat_ids),
+         'Reject null hypothesis': pd.Series(reject, index=feat_ids)})
+
+    if len(percentiles) == 0:
+        return ancom_df, pd.DataFrame()
+    else:
+        data = []
+        columns = []
+        for group in groups:
+            feat_dists = mat[cats == group]
+            for percentile in percentiles:
+                columns.append((percentile, group))
+                data.append(np.percentile(feat_dists, percentile, axis=0))
+        columns = pd.MultiIndex.from_tuples(columns,
+                                            names=['Percentile', 'Group'])
+        percentile_df = pd.DataFrame(
+            np.asarray(data).T, columns=columns, index=feat_ids)
+        return ancom_df, percentile_df
+
+
+def _holm_bonferroni(p):
+    """ Performs the Holm-Bonferroni correction on p-values
+    to account for multiple comparisons
+
+    Parameters
+    ----------
+    p: numpy.array
+        array of p-values
+
+    Returns
+    -------
+    numpy.array
+        corrected p-values
+    """
+    K = len(p)
+    sort_index = -np.ones(K, dtype=np.int64)
+    sorted_p = np.sort(p)
+    sorted_p_adj = sorted_p*(K-np.arange(K))
+    for j in range(K):
+        idx = (p == sorted_p[j]) & (sort_index < 0)
+        num_ties = len(sort_index[idx])
+        sort_index[idx] = np.arange(j, (j+num_ties), dtype=np.int64)
+
+    sorted_holm_p = [min([max(sorted_p_adj[:k]), 1])
+                     for k in range(1, K+1)]
+    holm_p = [sorted_holm_p[sort_index[k]] for k in range(K)]
+    return holm_p
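A small standalone illustration of what the Holm-Bonferroni step computes (the p-values are made up, and this is an equivalent vectorized form, not the patch's own code):

```python
# Holm-Bonferroni: scale the k-th smallest p-value by (K - k), then take a
# running maximum so the adjusted values stay monotone, capping at 1.
import numpy as np

p = np.array([0.005, 0.011, 0.02, 0.04, 0.13])
K = len(p)
order = np.argsort(p)
adj = p[order] * (K - np.arange(K))        # 5*p(1), 4*p(2), ..., 1*p(5)
adj = np.minimum(np.maximum.accumulate(adj), 1.0)
holm = np.empty(K)
holm[order] = adj                          # restore the original order
# adjusted values: 0.025, 0.044, 0.06, 0.08, 0.13
```

This matches `_holm_bonferroni` above, where `max(sorted_p_adj[:k])` plays the role of the running maximum.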
+
+
+def _log_compare(mat, cats,
+                 significance_test=scipy.stats.ttest_ind):
+    """ Calculates pairwise log ratios between all features and performs a
+    significance test (i.e. t-test) to determine if there is a significant
+    difference in feature ratios with respect to the variable of interest.
+
+    Parameters
+    ----------
+    mat: np.array
+       rows correspond to samples and columns correspond to
+       features (i.e. OTUs)
+    cats: np.array, float
+       Vector of categories
+    significance_test: function
+       statistical test to run
+
+    Returns
+    -------
+    log_ratio : np.array
+        log ratio p-value matrix (only the upper triangle is filled)
+    """
+    r, c = mat.shape
+    log_ratio = np.zeros((c, c))
+    log_mat = np.log(mat)
+    cs = np.unique(cats)
+
+    def func(x):
+        return significance_test(*[x[cats == k] for k in cs])
+
+    for i in range(c-1):
+        # Log ratios of feature i against all remaining features at once
+        ratio = (log_mat[:, i].T - log_mat[:, i+1:].T).T
+        m, p = np.apply_along_axis(func,
+                                   axis=0,
+                                   arr=ratio)
+        log_ratio[i, i+1:] = np.squeeze(np.array(p.T))
+    return log_ratio
+
+
+def _gram_schmidt_basis(n):
+    """
+    Builds a clr-transformed basis derived from
+    Gram-Schmidt orthogonalization
+
+    Parameters
+    ----------
+    n : int
+        Dimension of the Aitchison simplex
+    """
+    basis = np.zeros((n, n-1))
+    for j in range(n-1):
+        i = j + 1
+        e = np.array([(1/i)]*i + [-1] +
+                     [0]*(n-i-1))*np.sqrt(i/(i+1))
+        basis[:, j] = e
+    return basis.T
+
+
+@experimental(as_of="0.5.5")
+def sbp_basis(sbp):
+    r"""
+    Builds an orthogonal basis from a sequential binary partition (SBP). As
+    explained in [1]_, the SBP is a hierarchical collection of binary
+    divisions of compositional parts. The child groups are divided again until
+    all groups contain a single part. The SBP can be encoded in a
+    :math:`(D - 1) \times D` matrix where, for each row, parts can be grouped
+    by -1 and +1 tags, and 0 for excluded parts. The `sbp_basis` method was
+    originally derived from the function `gsi.buildilrBase()` found in the R
+    package `compositions` [2]_. The ith balance is computed as follows
+
+    .. math::
+        b_i = \sqrt{ \frac{r_i s_i}{r_i+s_i} }
+        \ln \left( \frac{g(x_{r_i})}{g(x_{s_i})} \right)
+
+    where :math:`b_i` is the ith balance corresponding to the ith row in the
+    SBP, :math:`r_i` and :math:`s_i` are the number of `+1` and `-1` labels,
+    respectively, in the ith row of the SBP and where :math:`g(x) =
+    (\prod\limits_{i=1}^{D} x_i)^{1/D}` is the geometric mean of :math:`x`.
+
+    Parameters
+    ----------
+    sbp: np.array, int
+        A contrast matrix, also known as a sequential binary partition, where
+        every row represents a partition between two groups of features. A
+        part labelled `+1` would correspond to that feature being in the
+        numerator of the given row partition, a part labelled `-1` would
+        correspond to features being in the denominator of that given row
+        partition, and `0` would correspond to features excluded in the row
+        partition.
+
+    Returns
+    -------
+    numpy.array
+        An orthonormal basis in the Aitchison simplex
+
+    Examples
+    --------
+    >>> import numpy as np
+    >>> sbp = np.array([[1, 1,-1,-1,-1],
+    ...                 [1,-1, 0, 0, 0],
+    ...                 [0, 0, 1,-1,-1],
+    ...                 [0, 0, 0, 1,-1]])
+    ...
+    >>> sbp_basis(sbp)
+    array([[ 0.31209907,  0.31209907,  0.12526729,  0.12526729,  0.12526729],
+           [ 0.36733337,  0.08930489,  0.18112058,  0.18112058,  0.18112058],
+           [ 0.17882092,  0.17882092,  0.40459293,  0.11888261,  0.11888261],
+           [ 0.18112058,  0.18112058,  0.18112058,  0.36733337,  0.08930489]])
+
+    References
+    ----------
+    .. [1] Parent, S.É., Parent, L.E., Egozcue, J.J., Rozane, D.E.,
+       Hernandes, A., Lapointe, L., Hébert-Gentile, V., Naess, K.,
+       Marchand, S., Lafond, J., Mattos, D., Barlow, P., Natale, W., 2013.
+       The plant ionome revisited by the nutrient balance concept.
+       Front. Plant Sci. 4, 39, http://dx.doi.org/10.3389/fpls.2013.00039.
+    .. [2] van den Boogaart, K. Gerald, Tolosana-Delgado, Raimon and Bren,
+       Matevz, 2014.
`compositions`: Compositional Data Analysis. R package + version 1.40-1. https://CRAN.R-project.org/package=compositions. + """ + + n_pos = (sbp == 1).sum(axis=1) + n_neg = (sbp == -1).sum(axis=1) + psi = np.zeros(sbp.shape) + for i in range(0, sbp.shape[0]): + psi[i, :] = sbp[i, :] * np.sqrt((n_neg[i] / n_pos[i])**sbp[i, :] / + np.sum(np.abs(sbp[i, :]))) + return clr_inv(psi) + + +def _check_orthogonality(basis): + """ + Checks to see if basis is truly orthonormal in the + Aitchison simplex + + Parameters + ---------- + basis: numpy.ndarray + basis in the Aitchison simplex + """ + basis = np.atleast_2d(basis) + if not np.allclose(inner(basis, basis), np.identity(len(basis)), + rtol=1e-4, atol=1e-6): + raise ValueError("Aitchison basis is not orthonormal") diff --git a/grimer/grimer.py b/grimer/grimer.py index 7aaaa83..5b232a9 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -220,6 +220,8 @@ def main(): ############ _d_ : data -> auxiliar containers to be used/shared among plots ############ usually by copying and/or transforming values into a _p_ container + cds_p_contaminants = generate_cds_plot_contaminants(table, tax, contaminants) + # _p_ # df: index (unique observations), col|..., tax|..., aux|ref # this cds an exeption and contains data to plot (col|) and auxiliary data (tax|) @@ -282,6 +284,10 @@ def main(): ele["infopanel"] = {} ele["infopanel"]["textarea"] = plot_infopanel() + # contaminants + ele["contaminants"] = {} + ele["contaminants"]["fig"], ele["contaminants"]["filter"] = plot_contaminants(table, cds_p_contaminants) + # mgnify ele["mgnify"] = {} if cds_p_mgnify: @@ -355,6 +361,7 @@ def main(): cds_p_decontam, cds_p_decontam_models, cds_d_decontam, + cds_p_contaminants, table.ranks(), min_obs_perc, max_total_count, diff --git a/grimer/layout.py b/grimer/layout.py index 50fa659..0d98905 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -19,6 +19,7 @@ def make_layout(ele, version, logo_path, title): width=top_panel_width_sides) info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] + info_tabs.append(Panel(child=column(ele["contaminants"]["fig"]), title="CC")) if ele["mgnify"]["fig"]: info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], row(ele["mgnify"]["wid"]["biome_spinner"], ele["mgnify"]["wid"]["help_button"])), title="MGNify")) diff --git a/grimer/plots.py b/grimer/plots.py index 079d17e..7f0d112 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -420,6 +420,40 @@ def plot_decontam_widgets(): "help_button": help_button(title="DECONTAM", text=help_text, align="start")} +def plot_contaminants(table, cds_p_contaminants): + contaminants_fig = figure(x_range=table.ranks(), height=150, width=300, + tools="save,wheel_zoom,reset") + + contaminants_filter = IndexFilter(indices=[]) + cds_view_contaminants = CDSView(source=cds_p_contaminants, filters=[contaminants_filter]) + + fixed_bar_options = ["direct", "child", "parent"] + palette = ["blue", "red", "black"] + contaminants_fig.vbar_stack(fixed_bar_options, + x="rank", + width=1, + source=cds_p_contaminants, + view=cds_view_contaminants, + color=palette, + line_color=None, # to avoid printing zeros + fill_alpha=[1, 0.3, 0.3]) + + return contaminants_fig, contaminants_filter + + +def plot_contaminants_widgets(): + pvalue_text = Paragraph(text="P-value") + pvalue_input = TextInput(value="", width=180, align='end') + + help_text = """ +contaminants explained +""" + + return {"pvalue_text": pvalue_text, + "pvalue_input": pvalue_input, + "help_button": help_button(title="Common Contaminants", 
text=help_text, align="start")}
+
+
 def plot_mgnify(cds_p_mgnify):
     mgnify_fig = figure(height=150, width=300, tools="save,wheel_zoom,reset")
diff --git a/grimer/source.py b/grimer/source.py
index 7c48e19..111b64c 100644
--- a/grimer/source.py
+++ b/grimer/source.py
@@ -3,75 +3,74 @@ class Source:
     def __init__(self, file: str=None, ids: list=[]):
-        # Only leaf ids/nodes
-        self.ids = set()
-        self.lineage = set()
-
-        # {id: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)}
-        self.refs = {}
+        self.ids = {}  # {refid: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)}}
+        self.children = {}  # {child_id: set(refids)}
+        self.parents = {}  # {parent_id: set(refids)}
 
         if file is not None:
             self.parse(file)
         elif ids:
-            self.ids.update(ids)
-            self.lineage.update(ids)
+            for i in ids:
+                self.add(i)
 
     def __repr__(self):
         args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()]
         return 'Source({})'.format(', '.join(args))
 
+    def add(self, i, ref: str=None, desc: str=None):
+        if i not in self.ids:
+            self.ids[i] = {}
+        if ref is not None:
+            if ref not in self.ids[i]:
+                self.ids[i][ref] = set()
+            if desc is not None:
+                self.ids[i][ref].add(desc)
+
+    def add_child(self, child, refid):
+        if child not in self.children:
+            self.children[child] = set()
+        self.children[child].add(refid)
+
+    def add_parent(self, parent, refid):
+        if parent not in self.parents:
+            self.parents[parent] = set()
+        self.parents[parent].add(refid)
+
     def parse(self, file):
         with open(file, 'r') as fh:
             if file.endswith(".yml") or file.endswith(".yaml"):
                 src = yaml.safe_load(fh)
                 for desc, val in src.items():
                     for ref, v in val.items():
-                        str_ids = list(map(str, v["ids"]))
-                        self.ids.update(str_ids)
-                        self.lineage.update(str_ids)
-                        for i in str_ids:
-                            self.add_refs_desc(i, (ref, v["url"]), desc)
+                        for i in map(str, v["ids"]):
+                            self.add(i, (ref, v["url"]), desc)
             else:
                 for line in fh:
                     main_id = line.rstrip()
-                    self.ids.add(main_id)
-                    self.lineage.add(main_id)
-
-    def update_lineage(self, ids):
-        self.lineage.update(ids)
+                    self.add(main_id)
 
     def update_taxids(self, taxid_updated):
-        # Update taxonomy entries or convert names to taxid
         for node, upd_node in taxid_updated.items():
             if upd_node is not None and upd_node != node:
                 print("Updated taxonomic node: " + node + " -> " + upd_node)
+                self.add(upd_node)
+                self.ids[upd_node].update(self.ids[node])
+                self.ids.pop(node)  # self.ids is a dict now; set.discard() would raise AttributeError
-                self.ids.add(upd_node)
-                self.lineage.discard(node)
-                self.lineage.add(upd_node)
-                if node in self.refs:
-                    self.refs[upd_node] = self.refs.pop(node)
-
-    def add_refs_desc(self, i, ref, desc):
-        if i not in self.refs:
-            self.refs[i] = {}
-        if ref not in self.refs[i]:
-            self.refs[i][ref] = set()
-        if desc is not None:
-            self.refs[i][ref].add(desc)
-
-    def get_refs_desc(self, i):
-        return self.refs[i] if i in self.refs else {}
-
-    def get_refs(self, i):
-        return list(self.refs[i].keys()) if i in self.refs else ()
+    def get_refs_desc(self, i, direct: bool=False, children: bool=False, parents: bool=False):
+        refs_desc = {}
+        if direct and i in self.ids:
+            refs_desc.update(self.ids[i])
+        if children and i in self.children:
+            for refid in self.children[i]:
+                refs_desc.update(self.ids[refid])
+        if parents and i in self.parents:
+            for refid in self.parents[i]:
+                refs_desc.update(self.ids[refid])
+        return refs_desc
 
-    def get_refs_count(self, i):
-        return len(self.refs[i]) if i in self.refs else 0
+    def get_refs_count(self, i, direct: bool=False, children: bool=False, parents: bool=False):
+        return len(self.get_refs_desc(i, direct, children, parents))
 
-    def update_refs(self, 
taxid_parent_rank): - for taxid, parent_taxid in taxid_parent_rank.items(): - if parent_taxid is not None and taxid in self.refs: - for i in self.refs[taxid]: - for r in self.refs[taxid][i]: - self.add_refs_desc(parent_taxid, i, r) + def lineage(self): + return set(list(self.ids.keys()) + list(self.children.keys()) + list(self.parents.keys())) diff --git a/grimer/sourceold.py b/grimer/sourceold.py new file mode 100644 index 0000000..e9d73e7 --- /dev/null +++ b/grimer/sourceold.py @@ -0,0 +1,77 @@ +import yaml + + +class SourceOld: + def __init__(self, file: str=None, ids: list=[]): + # Only leaf ids/nodes + self.ids = set() + self.lineage = set() + + # {id: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)} + self.refs = {} + + if file is not None: + self.parse(file) + elif ids: + self.ids.update(ids) + self.lineage.update(ids) + + def __repr__(self): + args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] + return 'Source({})'.format(', '.join(args)) + + def parse(self, file): + with open(file, 'r') as fh: + if file.endswith(".yml") or file.endswith(".yaml"): + src = yaml.safe_load(fh) + for desc, val in src.items(): + for ref, v in val.items(): + str_ids = list(map(str, v["ids"])) + self.ids.update(str_ids) + self.lineage.update(str_ids) + for i in str_ids: + self.add_refs_desc(i, (ref, v["url"]), desc) + else: + for line in fh: + main_id = line.rstrip() + self.ids.add(main_id) + self.lineage.add(main_id) + + def update_lineage(self, ids): + self.lineage.update(ids) + + def update_taxids(self, taxid_updated): + # Update taxonomy entries or convert names to taxid + for node, upd_node in taxid_updated.items(): + if upd_node is not None and upd_node != node: + print("Updated taxonomic node: " + node + " -> " + upd_node) + self.ids.discard(node) + self.ids.add(upd_node) + self.lineage.discard(node) + self.lineage.add(upd_node) + if node in self.refs: + self.refs[upd_node] = self.refs.pop(node) + + def add_refs_desc(self, i, ref, desc): + if i not in self.refs: + self.refs[i] = {} + if ref not in self.refs[i]: + self.refs[i][ref] = set() + if desc is not None: + self.refs[i][ref].add(desc) + + def get_refs_desc(self, i): + return self.refs[i] if i in self.refs else {} + + def get_refs(self, i): + return list(self.refs[i].keys()) if i in self.refs else () + + def get_refs_count(self, i): + return len(self.refs[i]) if i in self.refs else 0 + + def update_refs(self, taxid_parent_rank): + for taxid, parent_taxid in taxid_parent_rank.items(): + if parent_taxid is not None and taxid in self.refs: + for i in self.refs[taxid]: + for r in self.refs[taxid][i]: + self.add_refs_desc(parent_taxid, i, r) diff --git a/grimer/utils.py b/grimer/utils.py index a064184..55a86ef 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -508,30 +508,32 @@ def parse_sources(cfg, tax, ranks): references = {} for desc, sf in cfg["sources"]["contaminants"].items(): contaminants[desc] = Source(file=sf) - - # Update taxids / get taxid from name if tax: + # Update taxids / get taxid from name contaminants[desc].update_taxids(update_tax_nodes(contaminants[desc].ids, tax)) - ids = contaminants[desc].ids - - # lineage of all children nodes - for i in ids: + for i in list(contaminants[desc].ids.keys()): + # lineage of all children nodes (without itself) for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): - contaminants[desc].update_lineage(lin) - contaminants[desc].update_refs({i: l for l in lin}) + for l in lin[1:]: + contaminants[desc].add_child(l, i) + # lineage of all parent nodes 
(without itself) + for l in tax.lineage(i)[:-1]: + contaminants[desc].add_parent(l, i) for desc, sf in cfg["sources"]["references"].items(): references[desc] = Source(file=sf) # Update lineage and refs based on given taxonomy if tax: + # Update taxids / get taxid from name references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) - ids = references[desc].ids - - # lineage of all children nodes - for i in ids: + for i in list(references[desc].ids.keys()): + # lineage of all children nodes (without itself) for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): - references[desc].update_lineage(lin) - references[desc].update_refs({i: l for l in lin}) + for l in lin[1:]: + references[desc].add_child(l, i) + # lineage of all parent nodes (without itself) + for l in tax.lineage(i)[:-1]: + references[desc].add_parent(l, i) return contaminants, references From e89b0b77c8410fa2297f675dee3b55b0b42e4c2b Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Mon, 27 Sep 2021 16:04:22 +0200 Subject: [PATCH 04/50] adapt source to rest, select annotation file to plot --- grimer/callbacks.py | 7 +++--- grimer/cds.py | 22 ++++++++++++----- grimer/grimer.py | 6 +++-- grimer/layout.py | 2 +- grimer/plots.py | 58 +++++++++++++++++++++++++++++++++++---------- grimer/source.py | 3 --- 6 files changed, 69 insertions(+), 29 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index d36eb53..e952473 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -320,6 +320,7 @@ def link_obstable_samplebars(ele, contaminants_callback = CustomJS( args=dict(contaminants_fig=ele["contaminants"]["fig"], contaminants_filter=ele["contaminants"]["filter"], + contaminant_select=ele["contaminants"]["wid"]["contaminant_select"], cds_p_obstable=cds_p_obstable, cds_p_contaminants=cds_p_contaminants, active_ranks=active_ranks), @@ -334,14 +335,12 @@ def link_obstable_samplebars(ele, for(let r = 0; r < active_ranks.length; r++){ // get taxid of the rank let rank_obs = cds_p_obstable.data["tax|"+active_ranks[r]][row]; - if(cds_p_contaminants.data["obs"][i]==rank_obs && - cds_p_contaminants.data["cont"][i]=="CC Bacteria"){ + if(cds_p_contaminants.data["obs"][i]==rank_obs && cds_p_contaminants.data["cont"][i]==contaminant_select.value){ indices.push(i); } } } } - console.log(indices); contaminants_filter.indices = indices; cds_p_contaminants.change.emit(); ''') @@ -361,7 +360,7 @@ def link_obstable_samplebars(ele, ele["samplebars"]["wid"]["y1_select"].js_on_change('value', bar_select_callback, change_y_counts_label_callback, sort_groupby_callback) ele["samplebars"]["wid"]["y2_select"].js_on_change('value', plot_obs_callback, change_y_obs_label_callback, sort_groupby_callback) ele["mgnify"]["wid"]["biome_spinner"].js_on_change('value', mgnify_callback) - + ele["contaminants"]["wid"]["contaminant_select"].js_on_change('value', contaminants_callback) def link_heatmap_widgets(ele, cds_d_samples, diff --git a/grimer/cds.py b/grimer/cds.py index 91f2d13..44d131c 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -22,7 +22,9 @@ def generate_dict_taxname(tax, taxids): def generate_cds_plot_contaminants(table, tax, contaminants): - + # Stacked list of contaminants for each observation + # index -> observations (repeated) + # columns -> "rank", "cont", "direct", "child", "parent" clist = [] for rank in table.ranks(): for obs in table.observations(rank): @@ -34,6 +36,9 @@ def generate_cds_plot_contaminants(table, tax, contaminants): clist.append([obs, rank, desc, direct, child, parent]) 
     df_contaminants = pd.DataFrame(clist, columns=["obs", "rank", "cont", "direct", "child", "parent"])
+    df_contaminants.set_index('obs', inplace=True)
+
+    df_contaminants.sort_values(by="cont", ascending=False, inplace=True)
 
     print_df(df_contaminants, "df_contaminants -> cds_p_contaminants")
     return ColumnDataSource(df_contaminants)
@@ -52,14 +57,13 @@ def generate_cds_annotations(table, contaminants, references, controls, decontam
             df_rank["decontam"] = decontam.get_contaminants(rank, df_rank.index)
 
         for desc, ref in references.items():
-            df_rank[desc] = table.observations(rank).isin(ref.lineage())
-
+            df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) >= 1
         for desc, cont in contaminants.items():
-            df_rank[desc] = table.observations(rank).isin(cont.lineage())
+            df_rank[desc] = table.observations(rank).map(lambda x: cont.get_refs_count(x, direct=True)) >= 1
 
         if controls:
             for desc, ctrl in controls.items():
-                df_rank[desc] = table.observations(rank).isin(ctrl.lineage)
+                df_rank[desc] = table.observations(rank).map(lambda x: ctrl.get_refs_count(x, direct=True)) >= 1
 
         df_rank = pd.DataFrame(df_rank.stack(), columns=["val"]).reset_index(1)
         df_rank.rename(columns={"level_1": "annot"}, inplace=True)
@@ -192,7 +196,7 @@ def generate_cds_samples(table, references, contaminants, controls, decontam):
     for sources in source_list:
         for desc, src in sources:
             for rank in table.ranks():
-                idx = table.observations(rank).isin(src.lineage())
+                idx = table.observations(rank).map(lambda x: src.get_refs_count(x, direct=True)) >= 1
                 df_samples["cnt|" + rank + "|" + desc] = table.data[rank][table.observations(rank)[idx]].sum(axis=1)
 
     if decontam:
@@ -201,6 +205,9 @@ def generate_cds_samples(table, references, contaminants, controls, decontam):
             idx = table.observations(rank).isin(contaminants)
             df_samples["cnt|" + rank + "|decontam"] = table.data[rank][table.observations(rank)[idx]].sum(axis=1)
 
+    # fill NaN with zero so bars do not "disappear" when plotting
+    df_samples.fillna(0, inplace=True)
+
     print_df(df_samples, "df_samples -> cds_d_samples")
     return ColumnDataSource(df_samples)
 
@@ -280,6 +287,9 @@ def generate_cds_sampleobs(table):
     df_sampleobs = pd.DataFrame(index=table.samples)
     for rank in table.ranks():
         df_sampleobs = pd.concat([df_sampleobs, table.data[rank]], axis=1)
+
+    # fill NaN with zero so bars do not "disappear" when plotting
+    df_sampleobs.fillna(0, inplace=True)
 
     print_df(df_sampleobs, "df_sampleobs -> cds_d_sampleobs")
     return ColumnDataSource(df_sampleobs)
diff --git a/grimer/grimer.py b/grimer/grimer.py
index 5b232a9..4570f9a 100755
--- a/grimer/grimer.py
+++ b/grimer/grimer.py
@@ -220,7 +220,6 @@ def main():
     ############ _d_ : data -> auxiliary containers to be used/shared among plots
     ############ usually by copying and/or transforming values into a _p_ container
 
-    cds_p_contaminants = generate_cds_plot_contaminants(table, tax, contaminants)
 
     # _p_
     # df: index (unique observations), col|..., tax|..., aux|ref
     # this cds is an exception and contains data to plot (col|) and auxiliary data (tax|)
     cds_p_obstable = generate_cds_obstable(table, tax, contaminants, references, controls, control_samples, decontam)
     # df: index (unique sample-ids), aux|..., bar|..., tax|... 
cds_p_samplebars = generate_cds_bars(table) + # stacked: + cds_p_contaminants = generate_cds_plot_contaminants(table, tax, contaminants) # matrix: index (unique sample-ids), concentrations, controls, counts cds_p_decontam = generate_cds_plot_decontam(decontam) if decontam else None # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} @@ -286,7 +287,8 @@ def main(): # contaminants ele["contaminants"] = {} - ele["contaminants"]["fig"], ele["contaminants"]["filter"] = plot_contaminants(table, cds_p_contaminants) + ele["contaminants"]["fig"], ele["contaminants"]["filter"] = plot_contaminants(table, cds_p_contaminants, dict_d_taxname) + ele["contaminants"]["wid"] = plot_contaminants_widgets(contaminants) # mgnify ele["mgnify"] = {} diff --git a/grimer/layout.py b/grimer/layout.py index 0d98905..587d1d4 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -19,7 +19,7 @@ def make_layout(ele, version, logo_path, title): width=top_panel_width_sides) info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] - info_tabs.append(Panel(child=column(ele["contaminants"]["fig"]), title="CC")) + info_tabs.append(Panel(child=column(ele["contaminants"]["fig"], row(ele["contaminants"]["wid"]["contaminant_select"], ele["contaminants"]["wid"]["help_button"])), title="CC")) if ele["mgnify"]["fig"]: info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], row(ele["mgnify"]["wid"]["biome_spinner"], ele["mgnify"]["wid"]["help_button"])), title="MGNify")) diff --git a/grimer/plots.py b/grimer/plots.py index 7f0d112..b6e5d5f 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -15,14 +15,23 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): y_range=Range1d(start=0, end=max_total_count), plot_height=400, sizing_mode="stretch_width", - tools="box_zoom,reset,save,hover", - tooltips=[("Sample", "@index"), ("Value", "@$name")]) + tools="box_zoom,reset,save") + + samplebars_fig.add_tools(HoverTool( + tooltips=[ + ("Sample", "@index"), + ("Value", "@$name") + ], + mode="mouse", + point_policy="follow_mouse", + )) fixed_bar_options = ["selected", "others", "unassigned"] vbar_ren = samplebars_fig.vbar_stack(["bar|" + f for f in fixed_bar_options], x="aux|factors", width=1, source=cds_p_samplebars, + line_color=None, # to avoid printing small border for zeros color=make_color_palette(len(fixed_bar_options))) # Second y-axis to plot observations @@ -114,6 +123,7 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna vbar_ren = obsbars_fig.vbar_stack(bars, x="factors", source=cds_p_obsbars, width=1, + #line_color=None, # to avoid printing small border for zeros #color=make_color_palette(top_obs_bars, linear=True) + ("#868b8e", "#eeede7")) color=make_color_palette(top_obs_bars) + ("#868b8e", "#eeede7")) @@ -420,37 +430,59 @@ def plot_decontam_widgets(): "help_button": help_button(title="DECONTAM", text=help_text, align="start")} -def plot_contaminants(table, cds_p_contaminants): +def plot_contaminants(table, cds_p_contaminants, dict_d_taxname): contaminants_fig = figure(x_range=table.ranks(), height=150, width=300, - tools="save,wheel_zoom,reset") + tools="save,reset") + + # Need to pass dict_d_taxname inside a one column data + taxid_name_custom = CustomJSHover( + args=dict(dict_d_taxname=ColumnDataSource(dict(dict_d_taxname=[dict_d_taxname]))), + code="return dict_d_taxname.data.dict_d_taxname[0][value]; // value holds the @taxid" + ) + # Add custom tooltip for heatmap (taxid->name) + contaminants_fig.add_tools(HoverTool( + tooltips=[ + ('Observation', 
'@obs{custom}'), + ('# reported (directly)', '@direct'), + (' as children', '@child'), + (' as parent', '@parent'), + ], + mode="mouse", + point_policy="follow_mouse", + formatters={"@obs": taxid_name_custom} + )) contaminants_filter = IndexFilter(indices=[]) cds_view_contaminants = CDSView(source=cds_p_contaminants, filters=[contaminants_filter]) fixed_bar_options = ["direct", "child", "parent"] - palette = ["blue", "red", "black"] + palette = ["red", "orange", "black"] contaminants_fig.vbar_stack(fixed_bar_options, x="rank", width=1, source=cds_p_contaminants, view=cds_view_contaminants, color=palette, - line_color=None, # to avoid printing zeros + line_color=None, # to avoid printing small border for zeros fill_alpha=[1, 0.3, 0.3]) - return contaminants_fig, contaminants_filter + contaminants_fig.xaxis.major_label_orientation = "vertical" + contaminants_fig.xgrid.grid_line_color = None + contaminants_fig.xaxis.minor_tick_line_color = None + contaminants_fig.yaxis.minor_tick_line_color = None + contaminants_fig.xaxis.major_tick_line_color = None + contaminants_fig.yaxis.major_tick_line_color = None + contaminants_fig.yaxis.axis_label = "# reported" + return contaminants_fig, contaminants_filter -def plot_contaminants_widgets(): - pvalue_text = Paragraph(text="P-value") - pvalue_input = TextInput(value="", width=180, align='end') +def plot_contaminants_widgets(contaminants): + contaminant_select = Select(value=list(contaminants.keys())[0], width=200, options=list(contaminants.keys())) help_text = """ contaminants explained """ - - return {"pvalue_text": pvalue_text, - "pvalue_input": pvalue_input, + return {"contaminant_select": contaminant_select, "help_button": help_button(title="Common Contaminants", text=help_text, align="start")} diff --git a/grimer/source.py b/grimer/source.py index 111b64c..333a4f9 100644 --- a/grimer/source.py +++ b/grimer/source.py @@ -71,6 +71,3 @@ def get_refs_desc(self, i, direct: bool=False, children: bool=False, parents: bo def get_refs_count(self, i, direct: bool=False, children: bool=False, parents: bool=False): return len(self.get_refs_desc(i, direct, children, parents)) - - def lineage(self): - return set(list(self.ids.keys()) + list(self.children.keys()) + list(self.parents.keys())) From 1f4a1602095cd582fd63e3eefb92c7e7f686fb62 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Wed, 29 Sep 2021 15:20:42 +0200 Subject: [PATCH 05/50] unification of contamination and references --- config/default.yaml | 27 ++----- {sources => files}/README.md | 52 +++++++------ .../cc_bacteria.yml => files/contaminants.yml | 14 ++++ .../oral.yml => files/human-related.yml | 28 ++++++- .../taxa_counts_top10.tsv => files/mgnify.tsv | 0 grimer/callbacks.py | 4 +- grimer/cds.py | 66 +++++++---------- grimer/grimer.py | 25 +++---- grimer/plots.py | 11 +-- grimer/source.py | 2 +- grimer/utils.py | 74 ++++++++----------- sources/contaminants/cc_eukaryota.yml | 4 - sources/contaminants/cc_viruses.yml | 10 --- sources/references/human-related.yml | 13 ---- sources/references/skin.yml | 14 ---- 15 files changed, 155 insertions(+), 189 deletions(-) rename {sources => files}/README.md (66%) rename sources/contaminants/cc_bacteria.yml => files/contaminants.yml (59%) rename sources/references/oral.yml => files/human-related.yml (72%) rename sources/mgnify/taxa_counts_top10.tsv => files/mgnify.tsv (100%) delete mode 100644 sources/contaminants/cc_eukaryota.yml delete mode 100644 sources/contaminants/cc_viruses.yml delete mode 100644 sources/references/human-related.yml delete mode 100644 sources/references/skin.yml diff --git a/config/default.yaml b/config/default.yaml index 88db139..676507c 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -1,24 +1,13 @@ -sources: - contaminants: - "CC Bacteria": "sources/contaminants/cc_bacteria.yml" - "CC Viruses": "sources/contaminants/cc_viruses.yml" - "CC Eukaryota": "sources/contaminants/cc_eukaryota.yml" - # "Custom Contaminants 1": "path/contfile1.tsv" - # "Custom Contaminants 2": "path/contfile2.tsv" - references: - "Human-Related": "sources/references/human-related.yml" - "Skin": "sources/references/skin.yml" - "Oral": "sources/references/oral.yml" - # "Custom References 1": "path/reffile1.tsv" - # "Custom References 2": "path/reffile2.tsv" - -# samples: -# controls: -# "Positve Controls": "path/file1.tsv" -# "Negative Controls": "path/file1.tsv" +annotations: + "Contaminants": "files/contaminants.yml" + "Human-related": "files/human-related.yml" + +#controls: + # "Positve Controls": "path/file1.tsv" + # "Negative Controls": "path/file1.tsv" external: - mgnify: "sources/mgnify/taxa_counts_top10.tsv" + mgnify: "files/mgnify.tsv" decontam: threshold: 0.2 # [0-1] method: "frequency" # frequency, prevalence, combined diff --git a/sources/README.md b/files/README.md similarity index 66% rename from sources/README.md rename to files/README.md index 0dc401e..8002000 100644 --- a/sources/README.md +++ b/files/README.md @@ -1,31 +1,31 @@ -# GRIMER Sources +# GRIMER References and aux. files -## File formats - -Contaminant and reference sources can be provided to grimer in two formats: +## Reference file format 1) File with a list (one per line) of taxonomic identifiers or taxonomic names -2) Formatted .yml file: +or + +2) Formatted `.yml` file: - Description/Group 1: - Description/Group 2: - url: "" - ids: [] + "General Description": + "Specific description": + url: "www.website.com?id={}" + ids: [1,2,3] The url can be a link to the entries listed on the id. Use the `{}` as a placeholder for the id. 
Example: `https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}` The files should be provided in the main configuration file for grimer as follows: - sources: - contaminants: - "CUSTOM CONTAMINANTS 1": "file.txt" - "LAB CONT": "another_file.yml" - references: - "Human gut": "listofnames.txt" - "XYZ": "another.yaml" + references: + "Contaminants": "files/contaminants.yml" + "Human-related": "files/human-related.yml" + "CUSTOM CONTAMINANTS": "file.txt" + "LAB RELATED BACTERIA": "another_file.yml" + +### contaminants.yml -## Contaminants +Last update: 2021-04-01 | Organism group | Genus | Species | |----------------|-------|---------| @@ -49,20 +49,28 @@ The files should be provided in the main configuration file for grimer as follow | Viruses | 0 | 301 | 2019 Asplund, M. et al. | | Total (unique) | 201 | 625 | | -## References +### human-related.yml + +BacDive and eHOMD dump date: 2021-04-13 ```bash scripts/bacdive_download.py scripts/ehomd_download.py ``` -BacDive and eHOMD dump date: 2021-04-13 +## MGnify + +The downloaded MGnify database file should be provided in the main configuration file for grimer as follows: + + external: + mgnify: "files/mgnify.tsv" + +## mgnify.tsv -## MGNify +MGnify dump date: 2021-04-08 (latest study accession MGYS00005724) ```bash seq -f "MGYS%08g" 256 5724 | xargs -P 24 -I {} scripts/mgnify_download.py {} mgnify_dump_20210408/ > mgnify_dump_20210408.log 2>|1 | -scripts/mgnify_extract.py -f mgnify_dump_20210408 -t 10 -o taxa_counts_top10.tsv +scripts/mgnify_extract.py -f mgnify_dump_20210408 -t 10 -o files/mgnify.tsv ``` -MGnify dump date 2021-04-08 (latest study accession MGYS00005724) \ No newline at end of file diff --git a/sources/contaminants/cc_bacteria.yml b/files/contaminants.yml similarity index 59% rename from sources/contaminants/cc_bacteria.yml rename to files/contaminants.yml index 593f500..6a57d09 100644 --- a/sources/contaminants/cc_bacteria.yml +++ b/files/contaminants.yml @@ -44,3 +44,17 @@ "2017 Salter, S.J. et al.": url: "http://doi.org/10.1371/journal.pntd.0005975" ids: [50709, 299566, 1375, 2040, 507, 31988, 165779, 161492, 150247, 92793, 374, 55080, 1696, 41275, 369926, 32008, 194, 2717, 75, 10, 59732, 1716, 37914, 231454, 423604, 212791, 117563, 963, 1004300, 682522, 1357, 149698, 906, 68287, 407, 33882, 1839, 528, 376469, 84567, 335058, 28100, 838, 286, 83618, 48736, 379, 1835, 45669, 22, 28453, 13687, 40323, 1054211, 13275, 33057, 157, 213484, 29465, 1827, 265, 1386] +"Common Viral contaminants": + "2019 Asplund, M. 
et al.": + url: "http://doi.org/10.1016/j.cmi.2019.04.028" + ids: [12071, 742919, 11103, 31647, 12461, 10298, 10376, 10359, 11676, 129951, 10583, 31552, 10798, 11908, 585044, 518981, 1225745, 11620, 1891767, 493803, 11033, 159150, 35306, 68887, 11870, 11958, 11861, 11946, 11864, 363745, 363020, 242521, 11866, 11960, 31668, 31669, 31670, 11867, 11955, 11874, 11876, 11878, 11885, 36381, 11886, 11888, 269447, 269448, 11950, 11948, 1332312, 354090, 11884, 1352534, 1395610, 1395611, 1395612, 1395613, 1395614, 1395615, 1395616, 1395617, 1395618, 1395619, 1395620, 1341019, 11801, 11809, 1511763, 1394983, 697906, 1072204, 1148801, 1574422, 12104, 763552, 10264, 85708, 759804, 28344, 85506, 33747, 10345, 285986, 220638, 1154691, 185638, 1169627, 1045778, 185636, 72201, 345198, 176652, 1301280, 68347, 1618248, 1618254, 10288, 198112, 1454023, 1454024, 1454025, 1278278, 1278246, 1278252, 1278247, 1278248, 1278249, 1278250, 1278251, 399781, 1278255, 346932, 1278261, 1278263, 1278265, 1474867, 1379694, 1521385, 1521387, 1521389, 938081, 938082, 880162, 251749, 455370, 169864, 1379788, 1608440, 642253, 642255, 1224510, 1592207, 1592212, 1592083, 1592085, 1592086, 1592088, 1592093, 1592095, 1592096, 1592081, 1843761, 1519405, 1557033, 1608451, 664785, 1435438, 1170653, 40979, 12235, 12138, 11987, 51680, 12056, 146500, 554168, 212035, 1269028, 693272, 1420594, 1094892, 1128140, 1235314, 1128143, 1128151, 1128131, 1450746, 1461100, 181522, 1424633, 1010698, 1299317, 1450749, 1416631, 1128422, 1034806, 1592112, 1592113, 1592127, 938080, 1074214, 1519385, 1519387, 1519389, 1519390, 1519395, 1519396, 1519397, 186617, 1262072, 1407671, 743583, 340016, 745107, 745102, 745100, 1416009, 1187128, 889876, 760732, 1243183, 1229760, 1481186, 1505225, 1560342, 233894, 115987, 260149, 227470, 926067, 1127514, 1296654, 294382, 1486657, 1084719, 10756, 1486662, 1285382, 1497851, 1127515, 145579, 263375, 764562, 1133292, 1133022, 242527, 260373, 279280, 644524, 242861, 1132026, 1357714, 1197951, 1327981, 1327976, 1327979, 1327992, 1328030, 1327990, 1327980, 1327972, 1327982, 1327995, 1327983, 1327970, 1327971, 756279, 1327977, 1327993, 1328029, 1327975, 1327974, 1327985, 756280, 756282, 1527524, 1540094, 1042123, 541865, 1567016, 765765, 1176422, 1327037, 1162295, 1141135, 1141136, 335924, 536444, 929832, 682650, 1137745, 536473, 749413, 1477406, 1048515, 1048516, 1048517, 1048520, 1048521, 1537091, 1264700, 1609634, 1455074, 414970, 10863, 10864, 1222338, 1147148, 1237364, 1414766, 1977402, 948870, 1524881, 10665, 10760, 1147094, 1429767, 925983, 925984, 1527519, 1527506, 1229753, 1540097, 1540098, 1054461, 1391223, 294631, 1325731, 908819, 1458858, 1458842, 90963, 1536592, 1527515, 551895, 1129191, 139872, 201847, 287412, 1262517, 754044, 1385658, 1176423, 889949, 446529, 1034128, 1056830, 1089119, 1486472, 1034111, 205879, 1340709, 1567475, 1472912, 1204539, 1399915, 1283076, 1283077, 1168479, 1168478, 440250, 400567, 994601, 1465639, 889956, 445700, 444862, 536454, 445688, 444861, 1229794, 1229793, 1229792, 1229791, 1229790, 1229789, 1229786, 1229787, 1229788, 1229784, 1229782, 376758, 1498188, 504501, 504553, 1235647, 1235648, 1235649, 1235650, 1235653, 1235654, 1235655, 1235656, 1235657, 877240, 754052, 1316739, 347326, 1235689, 31535, 757342, 582345, 1462581, 386793, 1204517, 347327, 1335230, 743813, 1348912, 1327964, 270673, 188350, 1541891, 169683, 998086, 1500757, 1458843, 1129146, 1279082, 1114179, 1548900, 1231048, 1548901, 1449437, 1548918, 1476390, 462590, 754048, 948071, 1481785, 1417599, 1131316, 691965, 
136084, 754067, 1161935, 1173749, 1173761, 1173759, 1173762, 590739, 1406795, 1141134, 1204529, 1540099, 1168549, 866889, 1458859, 1458860, 1458861, 10761, 754060, 1524882, 1357423, 373126, 1150991, 1195080, 320843, 55510, 1434319, 320850, 369581, 537874, 1208587, 1566990, 10732, 490913, 1526550, 1340810, 756277, 753084, 753085, 756275, 1026955, 1340812, 238854, 555387, 754042, 444860, 981335, 469660, 215796, 1478972, 1385659, 926697, 336724, 278008, 1211417, 271647, 754075, 573173, 573174, 979525, 979534, 1529058, 1283071, 573176, 1589298, 1076759, 1461743, 1150989, 754058, 754051, 929835, 1414739, 754072, 1524880, 194802, 1168281, 1204514, 1188795, 331278] + "2015 Mukherjee, S. et al.": + url: "http://doi.org/10.1186/1944-3277-10-18" + ids: [10847] + "2015 Kjartansdóttir, K.R. et al.": + url: "https://doi.org/10.1073/pnas.1423756112" + ids: [322019] +"Common Eukaryotic contaminants": + "PRJNA168": + url: "https://www.ncbi.nlm.nih.gov/genome/guide/human/" + ids: [9606] diff --git a/sources/references/oral.yml b/files/human-related.yml similarity index 72% rename from sources/references/oral.yml rename to files/human-related.yml index 22e21dd..d994ffa 100644 --- a/sources/references/oral.yml +++ b/files/human-related.yml @@ -1,3 +1,16 @@ +"Human-related bacterial isolates from BacDive": + "Limbs": + url: "https://bacdive.dsmz.de/search?search=taxid:{}" + ids: [326522, 82380, 1701, 131110, 29388, 84698, 729, 69968, 28264, 200476, 58172, 490, 1280, 1290, 41276, 220685, 28449, 644, 1660, 147645, 1351, 90367, 391, 1717, 1720, 1314, 646, 29391, 732, 1365628, 90245, 495, 192066, 753, 1979962, 82633, 53363, 539, 37637, 37329, 755171, 29466, 291112, 614, 28090, 1402, 217204, 1509, 326523, 2014, 1303, 28038, 676, 105219, 13076, 66228, 504, 28189, 752, 108980, 1747, 1282, 1504, 33028, 303, 672, 28035, 1286, 485, 1311, 28188, 28132, 1328, 1506, 652, 29380, 760, 46124, 1379, 755172, 193461, 158822, 479, 68892, 479117, 33889, 670, 420404, 1305, 1697053, 71254, 310300, 47920, 669, 1245, 38289, 36740, 354351, 48296, 29318, 192, 38313, 180332, 135487, 33007, 287, 754, 29317, 1648, 1713, 1352, 550, 53437, 2054, 38284, 1667, 511, 1015, 40091, 59561, 411577, 587, 370622, 206506, 37326, 90239, 161902, 137732, 52132, 34105, 180588, 33968, 386414, 283734, 1891233, 478, 156979, 28125, 1529, 306, 123899, 220687, 620903, 1239307, 1348, 316, 28091, 178214, 84112, 44737, 487, 1536, 1273, 24, 630, 1034, 322095, 488730, 70348, 650, 43765, 43770, 39791, 115545, 150055, 411570, 196, 131111, 472, 38301, 51671, 292, 146827, 1785, 1977869, 40542, 29432, 28450, 1890675, 47312, 38875, 1710, 739, 47917, 33010, 1292, 169292, 158877, 1781, 400946, 501496, 488, 239, 361500, 470, 1430326, 29354, 82347, 65058, 714, 521520, 38303, 1513, 502790, 747, 1141657, 38304] + "Ear": + url: "https://bacdive.dsmz.de/search?search=taxid:{}" + ids: [1747, 2702, 1869190, 32002, 85698, 131111, 29388, 28037, 44750, 51671, 1353, 28264, 545, 292, 89093, 1872515, 1280, 511, 29379, 68766, 59561, 29321, 480, 1311, 285091, 727, 199591, 43263, 1313, 739, 760, 1661, 52769, 1421, 1314, 156979, 35703, 1898, 585, 87883, 90245, 123899, 306, 1895474, 670, 47770, 319939, 184870, 134375, 72557, 753, 663, 316, 1343, 217203, 267212, 678, 53364, 1014, 1776741, 93220, 1639, 666, 38313, 1652, 105219, 38287, 293, 33007, 287] + "Eye": + url: "https://bacdive.dsmz.de/search?search=taxid:{}" + ids: [40216, 28037, 1280, 490, 147645, 1351, 90367, 1824, 813, 1720, 38290, 29391, 732, 192066, 616, 161879, 753, 1304, 1655, 539, 37329, 28172, 161890, 90241, 504, 752, 253, 
457921, 1871047, 1309, 154288, 280147, 485, 760, 46124, 1931, 1379, 29394, 1671023, 68892, 479, 1396, 1544416, 2035, 420404, 735, 47846, 666, 571, 2055, 1401, 1270, 34062, 545, 38284, 247, 498, 40091, 59561, 370622, 37326, 727, 945844, 1313, 180588, 1685, 1671022, 478, 1302, 134375, 477, 726, 47478, 197575, 207340, 38287, 650, 756689, 43765, 69392, 723, 72556, 187491, 472, 51671, 2047, 1177728, 46125, 29432, 480, 47312, 739, 134533, 740, 37330, 488, 1544413, 239, 483, 29354, 41202, 38304] + "Nose": + url: "https://bacdive.dsmz.de/search?search=taxid:{}" + ids: [59823, 72556, 131111, 1282, 28264, 38284, 1280, 520, 43990, 615, 727, 1328, 1313, 90367, 760, 181487, 29394, 478, 732, 40324, 33889, 306, 39950, 1304, 1673725, 65058, 74319, 1591, 90241, 105219, 504, 286802, 195105, 574] "Human-related bacterial isolates from BacDive": "Oral": url: "https://bacdive.dsmz.de/search?search=taxid:{}" @@ -15,4 +28,17 @@ "Nasal": url: "http://www.ehomd.org/?name=HOMD" ids: [553601, 406557, 497962, 1715211, 170187, 553573, 196620, 93062, 553594, 406559, 1008452, 1608882, 488223, 1203632, 857577, 727, 869269, 478, 553568, 760810, 1739522, 548470, 857573, 281310, 455227, 521004, 359787, 1095736, 374931, 1069623, 585203, 1715217, 497980, 585202, 1715123, 1069628, 553580, 1203562, 1834153, 1203627, 866630, 1415766, 553583, 158879, 760787, 548475, 453361, 406561, 451516, 869215, 553574, 553590, 282458, 1608898, 456482, 374928, 553592, 553588, 1069626, 452948, 480, 1130804, 1095746, 453362, 375177, 406556, 71421, 273036, 451515, 548474, 521005, 374930, 406560, 1035187, 1203619, 553565, 406562, 1095745, 1859695, 1069625, 857578, 585204, 1739317, 862964, 1340484, 681288, 1239793, 487213, 857581, 497963, 760791, 857574, 374933, 1739280, 869216, 406563, 453366, 574093, 516950, 1415765, 453363, 262727, 857575, 453364, 1203625, 548473, 1340485, 406558, 703339, 760746, 1340486, 656912, 189423, 1239792, 512767, 857572, 857579, 760834, 760861, 1203622, 585161, 1203557, 1203566, 373153, 1203559, 1095735, 1203561, 656913, 886289, 262728, 488221, 553571, 869309, 553577, 171101, 375432, 359786, 857576, 1236608, 1095737, 1121367, 375063, 888828, 374927, 1203624, 365659, 525381, 760809, 512769, 418127, 595501, 246201, 512566, 546342, 158878, 883103, 488222, 1008453, 857571, 1739254, 487214, 453365, 1739452, 90241, 553567, 28037, 512768, 553581, 426430, 553596, 93061, 935897, 450394, 282459, 561276, 374932, 862965] - +"Human-related bacterial isolates from BacDive": + "Skin/Nail/Hair": + url: "https://bacdive.dsmz.de/search?search=taxid:{}" + ids: [1747, 106654, 1986155, 59823, 1869190, 1270, 71999, 1283, 1276, 131110, 1648, 1656, 472, 1352, 34062, 729, 29388, 2047, 28264, 1314, 1280, 1290, 672, 59561, 1780, 33918, 37326, 29432, 1286, 1891644, 74703, 90367, 1931, 33010, 1720, 1965292, 181487, 169292, 38290, 29506, 1622, 281920, 1292, 1781, 861, 1698, 1260, 2035, 202789, 521392, 470, 663, 29382, 1659, 1288, 37923, 1655, 45254, 1753, 1261, 38289, 36740, 1273, 1347368, 33034, 1347369, 1282, 66228, 132933, 43765, 287] +"Top organisms form the human skin microbiome" : + "Bacteria": + url: "https://doi.org/10.1038/nrmicro.2017.157" + ids: [257758, 225324, 169292, 161879, 146827, 43765, 38304, 38287, 38286, 29466, 29388, 28037, 1747, 1305, 1303, 1290, 1282, 1270] + "Eukarya": + url: "https://doi.org/10.1038/nrmicro.2017.157" + ids: [2510778, 1047171, 379413, 119676, 117179, 76777, 76775, 76773, 44058, 41880, 36894, 34391, 31312, 5480, 5068, 3074, 2762] + "Viruses": + url: "https://doi.org/10.1038/nrmicro.2017.157" + ids: [185639, 
746832, 10566, 493803, 10279, 746830, 746831, 46771] diff --git a/sources/mgnify/taxa_counts_top10.tsv b/files/mgnify.tsv similarity index 100% rename from sources/mgnify/taxa_counts_top10.tsv rename to files/mgnify.tsv diff --git a/grimer/callbacks.py b/grimer/callbacks.py index e952473..7eb5a74 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -335,7 +335,9 @@ def link_obstable_samplebars(ele, for(let r = 0; r < active_ranks.length; r++){ // get taxid of the rank let rank_obs = cds_p_obstable.data["tax|"+active_ranks[r]][row]; - if(cds_p_contaminants.data["obs"][i]==rank_obs && cds_p_contaminants.data["cont"][i]==contaminant_select.value){ + if(cds_p_contaminants.data["obs"][i]==rank_obs && + cds_p_contaminants.data["rank"][i]==active_ranks[r] && + cds_p_contaminants.data["annot"][i]==contaminant_select.value){ indices.push(i); } } diff --git a/grimer/cds.py b/grimer/cds.py index 44d131c..a71a2c4 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -21,29 +21,27 @@ def generate_dict_taxname(tax, taxids): return id_name -def generate_cds_plot_contaminants(table, tax, contaminants): +def generate_cds_plot_contaminants(table, tax, references): # Stacked list of contaminants for each observation # index -> observations (repeated) - # columns -> "rank", "cont", "direct", "child", "parent" + # columns -> "rank", "annot", "direct", "child", "parent" clist = [] for rank in table.ranks(): for obs in table.observations(rank): - for desc, cont in contaminants.items(): - direct = cont.get_refs_count(obs, direct=True) - child = cont.get_refs_count(obs, children=True) - parent = cont.get_refs_count(obs, parents=True) + for desc, ref in references.items(): + direct = ref.get_refs_count(obs, direct=True) + child = ref.get_refs_count(obs, children=True) + parent = ref.get_refs_count(obs, parents=True) if direct + child + parent > 0: clist.append([obs, rank, desc, direct, child, parent]) - df_contaminants = pd.DataFrame(clist, columns=["obs", "rank", "cont", "direct", "child", "parent"]) + df_contaminants = pd.DataFrame(clist, columns=["obs", "rank", "annot", "direct", "child", "parent"]) df_contaminants.set_index('obs', inplace=True) - df_contaminants.sort_values(by="cont", ascending=False, inplace=True) - print_df(df_contaminants, "df_contaminants -> cds_p_contaminants") return ColumnDataSource(df_contaminants) -def generate_cds_annotations(table, contaminants, references, controls, decontam): +def generate_cds_annotations(table, references, controls, decontam): # Stacked matrix of true annotations (omit false) # index -> taxids # columns -> rank, annot @@ -58,8 +56,6 @@ def generate_cds_annotations(table, contaminants, references, controls, decontam for desc, ref in references.items(): df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) >= 1 - for desc, cont in contaminants.items(): - df_rank[desc] = table.observations(rank).map(lambda x: cont.get_refs_count(x, direct=True)) >= 1 if controls: for desc, ctrl in controls.items(): @@ -80,7 +76,7 @@ def generate_cds_annotations(table, contaminants, references, controls, decontam return ColumnDataSource(df_annotations) -def generate_cds_obstable(table, tax, contaminants, references, controls, control_samples, decontam): +def generate_cds_obstable(table, tax, references, controls, control_samples, decontam): # index unique taxids # col|... values to plot to columns in the datatable # tax|... 
auxiliary lineage of taxa entries @@ -107,20 +103,9 @@ def generate_cds_obstable(table, tax, contaminants, references, controls, contro if decontam: df_rank["col|decontam"] = decontam.get_contaminants(rank, df_rank.index) - # Add a single column, concatenating ("|") references sources - df_rank["col|references"] = "" + # Add a column for each Annotation source for desc, ref in references.items(): - # Check if taxids are in the lineage of the reference - bool_ref = table.observations(rank).isin(ref.ids) - df_rank["col|references"] = df_rank["col|references"] + np.where(bool_ref, desc + " | ", "") - - # Add a column for each Contaminant source - for desc, cont in contaminants.items(): - #print(table.observations(rank).isin(cont.ids)) - #print(table.observations(rank).isin(cont.lineage)) - - df_rank["col|" + desc] = table.observations(rank).map(lambda x: cont.get_refs_count(x, direct=True)).to_list() - #df_rank["col|" + desc] = table.observations(rank).isin(cont.ids) + df_rank["col|" + desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)).to_list() # Add a column for each Control source if controls: @@ -170,7 +155,7 @@ def generate_cds_bars(table): return ColumnDataSource(df_bars) -def generate_cds_samples(table, references, contaminants, controls, decontam): +def generate_cds_samples(table, references, controls, decontam): # index unique sample-ids # aux| auxiliary values (not plotted) # cnt| count values to be copied/transformed to bars @@ -189,7 +174,7 @@ def generate_cds_samples(table, references, contaminants, controls, decontam): df_samples["cnt|" + rank + "|assigned"] = table.data[rank].sum(axis=1) # Add counts specific to sources - source_list = [references.items(), contaminants.items()] + source_list = [references.items()] if controls: source_list.append(controls.items()) @@ -385,7 +370,7 @@ def generate_dict_topobs(table, top_obs_bars): return dict_top_taxa -def generate_dict_refs(table, contaminants, references): +def generate_dict_refs(table, references): # dict with information about sources and references # references can be repeated among descriptions, sources and taxids # {taxid: {source: {desc: [refs]}} @@ -396,18 +381,17 @@ def generate_dict_refs(table, contaminants, references): used_ids.update(table.observations(rank)) for i in used_ids: - for source in [contaminants.items(), references.items()]: - for sname, s in source: - for ref, descs in s.get_refs_desc(i, direct=True).items(): - for desc in descs: - # Only add items if they have a reference to it - if i not in d_refs: - d_refs[i] = {} - if sname not in d_refs[i]: - d_refs[i][sname] = {} - if desc not in d_refs[i][sname]: - d_refs[i][sname][desc] = [] - d_refs[i][sname][desc].append(ref) + for sname, s in references.items(): + for ref, descs in s.get_refs_desc(i, direct=True).items(): + for desc in descs: + # Only add items if they have a reference to it + if i not in d_refs: + d_refs[i] = {} + if sname not in d_refs[i]: + d_refs[i][sname] = {} + if desc not in d_refs[i][sname]: + d_refs[i][sname][desc] = [] + d_refs[i][sname][desc].append(ref) print_df(d_refs, "d_refs -> dict_d_refs") return d_refs diff --git a/grimer/grimer.py b/grimer/grimer.py index 4570f9a..ed960db 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -177,12 +177,11 @@ def main(): # Sources of contamination/references/controls print_log("- Parsing sources (contamination/references/controls)") + references = {} if args.tax == "ncbi": - contaminants, references = parse_sources(cfg, tax, table.ranks()) - else: - contaminants, references = [{}, {}] + references = parse_references(cfg, tax, table.ranks()) - controls, control_samples = parse_controls(cfg, tax, table) + controls, control_samples = parse_controls(cfg, table) print_log("") # Run and load decontam results @@ -224,11 +223,11 @@ def main(): # _p_ # df: index (unique observations), col|..., tax|..., aux|ref # this cds is an exception and contains data to plot (col|) and auxiliary data (tax|) - cds_p_obstable = generate_cds_obstable(table, tax, contaminants, references, controls, control_samples, decontam) + cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam) # df: index (unique sample-ids), aux|..., bar|..., tax|... cds_p_samplebars = generate_cds_bars(table) # stacked: - cds_p_contaminants = generate_cds_plot_contaminants(table, tax, contaminants) + cds_p_contaminants = generate_cds_plot_contaminants(table, tax, references) # matrix: index (unique sample-ids), concentrations, controls, counts cds_p_decontam = generate_cds_plot_decontam(decontam) if decontam else None # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} @@ -240,7 +239,7 @@ def main(): # matrix: index (unique sample-ids), md0, md1, ..., md(max_metadata_cols) -> (metadata field, metadata values) cds_p_metadata = generate_cds_plot_metadata(metadata, max_metadata_cols) if metadata else None # stacked: index (repeated observations), rank, annot - cds_p_annotations = generate_cds_annotations(table, contaminants, references, controls, decontam) + cds_p_annotations = generate_cds_annotations(table, references, controls, decontam) # empty matrix {"x": [], "y": [], "c": []} cds_p_dendro_x, cds_p_dendro_y = generate_cds_plot_dendro() if not args.skip_dendrogram else [None, None] # stacked: index (repeated observations), other observation, rank, rho, pval, pval_corr @@ -252,7 +251,7 @@ def main(): # matrix: index (unique sample-ids), columns (unique observations) -> raw counts cds_d_sampleobs = generate_cds_sampleobs(table) # df: index (unique sample-ids), aux|..., cnt|..., - cds_d_samples = generate_cds_samples(table, references, contaminants, controls, decontam) + cds_d_samples = generate_cds_samples(table, references, controls, decontam) # matrix: index (unique sample-ids) x columns (metadata fields) -> metadata values cds_d_metadata = generate_cds_metadata(metadata) if metadata else None # {taxid: (contam_y1, contam_y2, non_contam_y, pval)} @@ -268,7 +267,7 @@ def main(): # {rank: [taxid1,taxid2, ..., taxid(top_obs_bars)]} dict_d_topobs = generate_dict_topobs(table, args.top_obs_bars) # {taxid: {source: {desc: [refs]}} - dict_d_refs = generate_dict_refs(table, contaminants, references) + dict_d_refs = generate_dict_refs(table, references) ############ PLOT ELEMENTS (Figures, Widgets, ...)
############ "fig": main figure @@ -278,7 +277,7 @@ def main(): # obstable ele["obstable"] = {} - ele["obstable"]["fig"], ele["obstable"]["widgets_filter"] = plot_obstable(cds_p_obstable, table.ranks(), contaminants.keys(), controls.keys()) + ele["obstable"]["fig"], ele["obstable"]["widgets_filter"] = plot_obstable(cds_p_obstable, table.ranks(), references.keys(), controls.keys()) ele["obstable"]["wid"] = plot_obstable_widgets(dict_d_taxname, max(cds_p_obstable.data["col|total_counts"])) # infopanel @@ -288,7 +287,7 @@ def main(): # contaminants ele["contaminants"] = {} ele["contaminants"]["fig"], ele["contaminants"]["filter"] = plot_contaminants(table, cds_p_contaminants, dict_d_taxname) - ele["contaminants"]["wid"] = plot_contaminants_widgets(contaminants) + ele["contaminants"]["wid"] = plot_contaminants_widgets(references) # mgnify ele["mgnify"] = {} @@ -310,13 +309,13 @@ def main(): # samplebars ele["samplebars"] = {} ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, max_total_count, table.ranks()) - ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, list(contaminants.keys()), list(references.keys()), list(controls.keys()), decontam) + ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, list(references.keys()), list(controls.keys()), decontam) # heatmap tools_heatmap = "hover,save,box_zoom,reset,crosshair,box_select" ele["heatmap"] = {} ele["heatmap"]["fig"] = plot_heatmap(table, cds_p_heatmap, tools_heatmap, args.transformation, dict_d_taxname) - ele["heatmap"]["wid"] = plot_heatmap_widgets(table.ranks(), args.linkage_methods, args.linkage_metrics, list(contaminants.keys()), list(references.keys()), list(controls.keys()), metadata, decontam) + ele["heatmap"]["wid"] = plot_heatmap_widgets(table.ranks(), args.linkage_methods, args.linkage_metrics, list(references.keys()), list(controls.keys()), metadata, decontam) # metadata (heatmap) ele["metadata"] = {} diff --git a/grimer/plots.py b/grimer/plots.py index b6e5d5f..855cdfb 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -221,12 +221,11 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs "help_button": help_button(title="Observation bars", text=help_text)} -def plot_samplebars_widgets(ranks, metadata, contaminant_names, reference_names, control_names, decontam): +def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, decontam): annotbar_rank_select = Select(title="Annotate bars at rank:", value=ranks[0], options=[r for r in ranks]) annotbar_options = {} annotbar_options["Default"] = ["assigned"] - annotbar_options["Contaminant"] = [c for c in contaminant_names] annotbar_options["References"] = [r for r in reference_names] annotbar_options["Controls"] = [c for c in control_names] if decontam: @@ -301,7 +300,6 @@ def plot_obstable(cds_p_obstable, ranks, contaminant_names, control_names): for cont_name in contaminant_names: table_cols.append(TableColumn(field="col|" + cont_name, title=cont_name, default_sort="descending")) - table_cols.append(TableColumn(field="col|references", title="References", default_sort="descending")) if "col|decontam" in cds_p_obstable.data: table_cols.append(TableColumn(field="col|decontam", title="DECONTAM", default_sort="descending")) @@ -477,8 +475,8 @@ def plot_contaminants(table, cds_p_contaminants, dict_d_taxname): return contaminants_fig, contaminants_filter -def plot_contaminants_widgets(contaminants): - contaminant_select 
= Select(value=list(contaminants.keys())[0], width=200, options=list(contaminants.keys())) +def plot_contaminants_widgets(references): + contaminant_select = Select(value=list(references.keys())[0], width=200, options=list(references.keys())) help_text = """ contaminants explained """ @@ -612,14 +610,13 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax return heatmap -def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, contaminant_names, reference_names, controls_names, metadata, decontam): +def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_names, controls_names, metadata, decontam): rank_select = Select(title="Taxonomic rank:", value=ranks[0], options=ranks) x_sort_options = {} x_sort_options["Clustering Metric"] = [("metric|" + lm, lm) for lm in linkage_metrics] x_sort_options["Default order"] = [("none", "none"), ("counts", "counts"), ("observations", "observations")] - x_sort_options["Sort by Contaminants"] = [("annot|" + c, c) for c in contaminant_names] x_sort_options["Sort by References"] = [("annot|" + r, r) for r in reference_names] if controls_names: x_sort_options["Sort by Controls"] = [("annot|" + c, c) for c in controls_names] diff --git a/grimer/source.py b/grimer/source.py index 333a4f9..c4615f2 100644 --- a/grimer/source.py +++ b/grimer/source.py @@ -11,7 +11,7 @@ def __init__(self, file: str=None, ids: list=[]): self.parse(file) elif ids: for i in ids: - self.add(i) + self.add(i, "", "") def __repr__(self): args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] diff --git a/grimer/utils.py b/grimer/utils.py index 55a86ef..a25e107 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -44,13 +44,17 @@ def parse_input_table(input_file, unassigned_header, transpose, min_frequency, m total = table_df.sum(axis=1) # unique unassigned/unclassified for table + # Separate unassigned counts column from main data frame unassigned = pd.Series(0, index=table_df.index) if unassigned_header: for header in unassigned_header: if header in table_df.columns: - # Separate unassigned counts column from main data frame - # Sum in case there are several equally named headers - unassigned = unassigned + table_df[header].sum(axis=1) + if isinstance(table_df[header], pd.DataFrame): + # Sum in case there are several equally named headers + unassigned += table_df[header].sum(axis=1) + else: + # return a pd.Series + unassigned += table_df[header] table_df.drop(columns=header, inplace=True) else: print_log("'" + header + "' header not found") @@ -503,60 +507,44 @@ def include_scripts(scripts): return template -def parse_sources(cfg, tax, ranks): - contaminants = {} - references = {} - for desc, sf in cfg["sources"]["contaminants"].items(): - contaminants[desc] = Source(file=sf) - if tax: - # Update taxids / get taxid from name - contaminants[desc].update_taxids(update_tax_nodes(contaminants[desc].ids, tax)) - for i in list(contaminants[desc].ids.keys()): - # lineage of all children nodes (without itself) - for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): - for l in lin[1:]: - contaminants[desc].add_child(l, i) - # lineage of all parent nodes (without itself) - for l in tax.lineage(i)[:-1]: - contaminants[desc].add_parent(l, i) +def parse_references(cfg, tax, ranks): + annotations = {} - for desc, sf in cfg["sources"]["references"].items(): - references[desc] = Source(file=sf) - # Update lineage and refs based on given taxonomy + for desc, sf in cfg["annotations"].items(): + 
annotations[desc] = Source(file=sf) if tax: # Update taxids / get taxid from name - references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) - for i in list(references[desc].ids.keys()): + annotations[desc].update_taxids(update_tax_nodes(annotations[desc].ids, tax)) + for i in list(annotations[desc].ids.keys()): # lineage of all children nodes (without itself) for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): for l in lin[1:]: - references[desc].add_child(l, i) + annotations[desc].add_child(l, i) # lineage of all parent nodes (without itself) for l in tax.lineage(i)[:-1]: - references[desc].add_parent(l, i) + annotations[desc].add_parent(l, i) - return contaminants, references + return annotations -def parse_controls(cfg, tax, table): +def parse_controls(cfg, table): controls = {} control_samples = {} - if "samples" in cfg: - if "controls" in cfg["samples"]: - for desc, cf in cfg["samples"]["controls"].items(): - with open(cf, "r") as file: - samples = file.read().splitlines() - control_table = table.get_subtable(table.ranks()[-1], samples=samples) - controls[desc] = Source(ids=control_table.columns.to_list()) - control_samples[desc] = control_table.index.to_list() - - if tax: - ids = controls[desc].ids - # lineage of all children nodes - for i in ids: - for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): - controls[desc].update_lineage(lin) + if "controls" in cfg: + for desc, cf in cfg["controls"].items(): + with open(cf, "r") as file: + samples = file.read().splitlines() + obs = set() + valid_samples = set() + for rank in table.ranks(): + # Retrieve sub-table for every rank and add to the source + control_table = table.get_subtable(rank, samples=samples) + obs.update(control_table.columns.to_list()) + valid_samples.update(control_table.index.to_list()) + + controls[desc] = Source(ids=obs) + control_samples[desc] = list(valid_samples) return controls, control_samples diff --git a/sources/contaminants/cc_eukaryota.yml b/sources/contaminants/cc_eukaryota.yml deleted file mode 100644 index 854d5f9..0000000 --- a/sources/contaminants/cc_eukaryota.yml +++ /dev/null @@ -1,4 +0,0 @@ -"Common Eukaryotic contaminants": - "PRJNA168": - url: "https://www.ncbi.nlm.nih.gov/genome/guide/human/" - ids: [9606] diff --git a/sources/contaminants/cc_viruses.yml b/sources/contaminants/cc_viruses.yml deleted file mode 100644 index 95fae88..0000000 --- a/sources/contaminants/cc_viruses.yml +++ /dev/null @@ -1,10 +0,0 @@ -"Common Viral contaminants": - "2019 Asplund, M. 
et al.": - url: "http://doi.org/10.1016/j.cmi.2019.04.028" - ids: [12071, 742919, 11103, 31647, 12461, 10298, 10376, 10359, 11676, 129951, 10583, 31552, 10798, 11908, 585044, 518981, 1225745, 11620, 1891767, 493803, 11033, 159150, 35306, 68887, 11870, 11958, 11861, 11946, 11864, 363745, 363020, 242521, 11866, 11960, 31668, 31669, 31670, 11867, 11955, 11874, 11876, 11878, 11885, 36381, 11886, 11888, 269447, 269448, 11950, 11948, 1332312, 354090, 11884, 1352534, 1395610, 1395611, 1395612, 1395613, 1395614, 1395615, 1395616, 1395617, 1395618, 1395619, 1395620, 1341019, 11801, 11809, 1511763, 1394983, 697906, 1072204, 1148801, 1574422, 12104, 763552, 10264, 85708, 759804, 28344, 85506, 33747, 10345, 285986, 220638, 1154691, 185638, 1169627, 1045778, 185636, 72201, 345198, 176652, 1301280, 68347, 1618248, 1618254, 10288, 198112, 1454023, 1454024, 1454025, 1278278, 1278246, 1278252, 1278247, 1278248, 1278249, 1278250, 1278251, 399781, 1278255, 346932, 1278261, 1278263, 1278265, 1474867, 1379694, 1521385, 1521387, 1521389, 938081, 938082, 880162, 251749, 455370, 169864, 1379788, 1608440, 642253, 642255, 1224510, 1592207, 1592212, 1592083, 1592085, 1592086, 1592088, 1592093, 1592095, 1592096, 1592081, 1843761, 1519405, 1557033, 1608451, 664785, 1435438, 1170653, 40979, 12235, 12138, 11987, 51680, 12056, 146500, 554168, 212035, 1269028, 693272, 1420594, 1094892, 1128140, 1235314, 1128143, 1128151, 1128131, 1450746, 1461100, 181522, 1424633, 1010698, 1299317, 1450749, 1416631, 1128422, 1034806, 1592112, 1592113, 1592127, 938080, 1074214, 1519385, 1519387, 1519389, 1519390, 1519395, 1519396, 1519397, 186617, 1262072, 1407671, 743583, 340016, 745107, 745102, 745100, 1416009, 1187128, 889876, 760732, 1243183, 1229760, 1481186, 1505225, 1560342, 233894, 115987, 260149, 227470, 926067, 1127514, 1296654, 294382, 1486657, 1084719, 10756, 1486662, 1285382, 1497851, 1127515, 145579, 263375, 764562, 1133292, 1133022, 242527, 260373, 279280, 644524, 242861, 1132026, 1357714, 1197951, 1327981, 1327976, 1327979, 1327992, 1328030, 1327990, 1327980, 1327972, 1327982, 1327995, 1327983, 1327970, 1327971, 756279, 1327977, 1327993, 1328029, 1327975, 1327974, 1327985, 756280, 756282, 1527524, 1540094, 1042123, 541865, 1567016, 765765, 1176422, 1327037, 1162295, 1141135, 1141136, 335924, 536444, 929832, 682650, 1137745, 536473, 749413, 1477406, 1048515, 1048516, 1048517, 1048520, 1048521, 1537091, 1264700, 1609634, 1455074, 414970, 10863, 10864, 1222338, 1147148, 1237364, 1414766, 1977402, 948870, 1524881, 10665, 10760, 1147094, 1429767, 925983, 925984, 1527519, 1527506, 1229753, 1540097, 1540098, 1054461, 1391223, 294631, 1325731, 908819, 1458858, 1458842, 90963, 1536592, 1527515, 551895, 1129191, 139872, 201847, 287412, 1262517, 754044, 1385658, 1176423, 889949, 446529, 1034128, 1056830, 1089119, 1486472, 1034111, 205879, 1340709, 1567475, 1472912, 1204539, 1399915, 1283076, 1283077, 1168479, 1168478, 440250, 400567, 994601, 1465639, 889956, 445700, 444862, 536454, 445688, 444861, 1229794, 1229793, 1229792, 1229791, 1229790, 1229789, 1229786, 1229787, 1229788, 1229784, 1229782, 376758, 1498188, 504501, 504553, 1235647, 1235648, 1235649, 1235650, 1235653, 1235654, 1235655, 1235656, 1235657, 877240, 754052, 1316739, 347326, 1235689, 31535, 757342, 582345, 1462581, 386793, 1204517, 347327, 1335230, 743813, 1348912, 1327964, 270673, 188350, 1541891, 169683, 998086, 1500757, 1458843, 1129146, 1279082, 1114179, 1548900, 1231048, 1548901, 1449437, 1548918, 1476390, 462590, 754048, 948071, 1481785, 1417599, 1131316, 691965, 
136084, 754067, 1161935, 1173749, 1173761, 1173759, 1173762, 590739, 1406795, 1141134, 1204529, 1540099, 1168549, 866889, 1458859, 1458860, 1458861, 10761, 754060, 1524882, 1357423, 373126, 1150991, 1195080, 320843, 55510, 1434319, 320850, 369581, 537874, 1208587, 1566990, 10732, 490913, 1526550, 1340810, 756277, 753084, 753085, 756275, 1026955, 1340812, 238854, 555387, 754042, 444860, 981335, 469660, 215796, 1478972, 1385659, 926697, 336724, 278008, 1211417, 271647, 754075, 573173, 573174, 979525, 979534, 1529058, 1283071, 573176, 1589298, 1076759, 1461743, 1150989, 754058, 754051, 929835, 1414739, 754072, 1524880, 194802, 1168281, 1204514, 1188795, 331278] - "2015 Mukherjee, S. et al.": - url: "http://doi.org/10.1186/1944-3277-10-18" - ids: [10847] - "2015 Kjartansdóttir, K.R. et al.": - url: "https://doi.org/10.1073/pnas.1423756112" - ids: [322019] diff --git a/sources/references/human-related.yml b/sources/references/human-related.yml deleted file mode 100644 index 5106795..0000000 --- a/sources/references/human-related.yml +++ /dev/null @@ -1,13 +0,0 @@ -"Human-related bacterial isolates from BacDive": - "Limbs": - url: "https://bacdive.dsmz.de/search?search=taxid:{}" - ids: [326522, 82380, 1701, 131110, 29388, 84698, 729, 69968, 28264, 200476, 58172, 490, 1280, 1290, 41276, 220685, 28449, 644, 1660, 147645, 1351, 90367, 391, 1717, 1720, 1314, 646, 29391, 732, 1365628, 90245, 495, 192066, 753, 1979962, 82633, 53363, 539, 37637, 37329, 755171, 29466, 291112, 614, 28090, 1402, 217204, 1509, 326523, 2014, 1303, 28038, 676, 105219, 13076, 66228, 504, 28189, 752, 108980, 1747, 1282, 1504, 33028, 303, 672, 28035, 1286, 485, 1311, 28188, 28132, 1328, 1506, 652, 29380, 760, 46124, 1379, 755172, 193461, 158822, 479, 68892, 479117, 33889, 670, 420404, 1305, 1697053, 71254, 310300, 47920, 669, 1245, 38289, 36740, 354351, 48296, 29318, 192, 38313, 180332, 135487, 33007, 287, 754, 29317, 1648, 1713, 1352, 550, 53437, 2054, 38284, 1667, 511, 1015, 40091, 59561, 411577, 587, 370622, 206506, 37326, 90239, 161902, 137732, 52132, 34105, 180588, 33968, 386414, 283734, 1891233, 478, 156979, 28125, 1529, 306, 123899, 220687, 620903, 1239307, 1348, 316, 28091, 178214, 84112, 44737, 487, 1536, 1273, 24, 630, 1034, 322095, 488730, 70348, 650, 43765, 43770, 39791, 115545, 150055, 411570, 196, 131111, 472, 38301, 51671, 292, 146827, 1785, 1977869, 40542, 29432, 28450, 1890675, 47312, 38875, 1710, 739, 47917, 33010, 1292, 169292, 158877, 1781, 400946, 501496, 488, 239, 361500, 470, 1430326, 29354, 82347, 65058, 714, 521520, 38303, 1513, 502790, 747, 1141657, 38304] - "Ear": - url: "https://bacdive.dsmz.de/search?search=taxid:{}" - ids: [1747, 2702, 1869190, 32002, 85698, 131111, 29388, 28037, 44750, 51671, 1353, 28264, 545, 292, 89093, 1872515, 1280, 511, 29379, 68766, 59561, 29321, 480, 1311, 285091, 727, 199591, 43263, 1313, 739, 760, 1661, 52769, 1421, 1314, 156979, 35703, 1898, 585, 87883, 90245, 123899, 306, 1895474, 670, 47770, 319939, 184870, 134375, 72557, 753, 663, 316, 1343, 217203, 267212, 678, 53364, 1014, 1776741, 93220, 1639, 666, 38313, 1652, 105219, 38287, 293, 33007, 287] - "Eye": - url: "https://bacdive.dsmz.de/search?search=taxid:{}" - ids: [40216, 28037, 1280, 490, 147645, 1351, 90367, 1824, 813, 1720, 38290, 29391, 732, 192066, 616, 161879, 753, 1304, 1655, 539, 37329, 28172, 161890, 90241, 504, 752, 253, 457921, 1871047, 1309, 154288, 280147, 485, 760, 46124, 1931, 1379, 29394, 1671023, 68892, 479, 1396, 1544416, 2035, 420404, 735, 47846, 666, 571, 2055, 1401, 1270, 34062, 545, 38284, 247, 
498, 40091, 59561, 370622, 37326, 727, 945844, 1313, 180588, 1685, 1671022, 478, 1302, 134375, 477, 726, 47478, 197575, 207340, 38287, 650, 756689, 43765, 69392, 723, 72556, 187491, 472, 51671, 2047, 1177728, 46125, 29432, 480, 47312, 739, 134533, 740, 37330, 488, 1544413, 239, 483, 29354, 41202, 38304] - "Nose": - url: "https://bacdive.dsmz.de/search?search=taxid:{}" - ids: [59823, 72556, 131111, 1282, 28264, 38284, 1280, 520, 43990, 615, 727, 1328, 1313, 90367, 760, 181487, 29394, 478, 732, 40324, 33889, 306, 39950, 1304, 1673725, 65058, 74319, 1591, 90241, 105219, 504, 286802, 195105, 574] diff --git a/sources/references/skin.yml b/sources/references/skin.yml deleted file mode 100644 index ccd9467..0000000 --- a/sources/references/skin.yml +++ /dev/null @@ -1,14 +0,0 @@ -"Human-related bacterial isolates from BacDive": - "Skin/Nail/Hair": - url: "https://bacdive.dsmz.de/search?search=taxid:{}" - ids: [1747, 106654, 1986155, 59823, 1869190, 1270, 71999, 1283, 1276, 131110, 1648, 1656, 472, 1352, 34062, 729, 29388, 2047, 28264, 1314, 1280, 1290, 672, 59561, 1780, 33918, 37326, 29432, 1286, 1891644, 74703, 90367, 1931, 33010, 1720, 1965292, 181487, 169292, 38290, 29506, 1622, 281920, 1292, 1781, 861, 1698, 1260, 2035, 202789, 521392, 470, 663, 29382, 1659, 1288, 37923, 1655, 45254, 1753, 1261, 38289, 36740, 1273, 1347368, 33034, 1347369, 1282, 66228, 132933, 43765, 287] -"Top organisms form the human skin microbiome" : - "Bacteria": - url: "https://doi.org/10.1038/nrmicro.2017.157" - ids: [257758, 225324, 169292, 161879, 146827, 43765, 38304, 38287, 38286, 29466, 29388, 28037, 1747, 1305, 1303, 1290, 1282, 1270] - "Eukarya": - url: "https://doi.org/10.1038/nrmicro.2017.157" - ids: [2510778, 1047171, 379413, 119676, 117179, 76777, 76775, 76773, 44058, 41880, 36894, 34391, 31312, 5480, 5068, 3074, 2762] - "Viruses": - url: "https://doi.org/10.1038/nrmicro.2017.157" - ids: [185639, 746832, 10566, 493803, 10279, 746830, 746831, 46771] From 4478ea84bab96e96cbe21a225546dd33a71dac0f Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Wed, 29 Sep 2021 17:37:07 +0200 Subject: [PATCH 06/50] proper name data structs references --- config/default.yaml | 2 +- grimer/callbacks.py | 30 ++++++++++----------- grimer/cds.py | 14 +++++----- grimer/grimer.py | 14 +++++----- grimer/layout.py | 2 +- grimer/plots.py | 66 ++++++++++++++++++++++----------------------- grimer/utils.py | 16 +++++------ 7 files changed, 72 insertions(+), 72 deletions(-) diff --git a/config/default.yaml b/config/default.yaml index 676507c..cd4bd40 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -1,4 +1,4 @@ -annotations: +references: "Contaminants": "files/contaminants.yml" "Human-related": "files/human-related.yml" diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 7eb5a74..114fa42 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -10,7 +10,7 @@ def link_obstable_samplebars(ele, cds_p_decontam, cds_p_decontam_models, cds_d_decontam, - cds_p_contaminants, + cds_p_references, active_ranks, min_obs_perc, max_total_count, @@ -317,37 +317,37 @@ def link_obstable_samplebars(ele, cds_p_mgnify.change.emit(); ''') - contaminants_callback = CustomJS( - args=dict(contaminants_fig=ele["contaminants"]["fig"], - contaminants_filter=ele["contaminants"]["filter"], - contaminant_select=ele["contaminants"]["wid"]["contaminant_select"], + references_callback = CustomJS( + args=dict(references_fig=ele["references"]["fig"], + references_filter=ele["references"]["filter"], + references_select=ele["references"]["wid"]["references_select"], cds_p_obstable=cds_p_obstable, - cds_p_contaminants=cds_p_contaminants, + cds_p_references=cds_p_references, active_ranks=active_ranks), code=''' - console.log("contaminants_callback"); + console.log("references_callback"); // selected row const row = cds_p_obstable.selected.indices[0]; const indices = []; if (row!=undefined){ - for(let i = 0; i < cds_p_contaminants.length; i++){ + for(let i = 0; i < cds_p_references.length; i++){ // for each rank for(let r = 0; r < active_ranks.length; r++){ // get taxid of the rank let rank_obs = cds_p_obstable.data["tax|"+active_ranks[r]][row]; - if(cds_p_contaminants.data["obs"][i]==rank_obs && - cds_p_contaminants.data["rank"][i]==active_ranks[r] && - cds_p_contaminants.data["annot"][i]==contaminant_select.value){ + if(cds_p_references.data["obs"][i]==rank_obs && + cds_p_references.data["rank"][i]==active_ranks[r] && + cds_p_references.data["ref"][i]==references_select.value){ indices.push(i); } } } } - contaminants_filter.indices = indices; - cds_p_contaminants.change.emit(); + references_filter.indices = indices; + cds_p_references.change.emit(); ''') - obstable_callbacks = [plot_obs_callback, change_text_legend_obs_callback, sort_groupby_callback, load_infopanel, contaminants_callback] + obstable_callbacks = [plot_obs_callback, change_text_legend_obs_callback, sort_groupby_callback, load_infopanel, references_callback] if cds_p_decontam: obstable_callbacks.append(decontam_callback) if cds_p_mgnify: @@ -362,7 +362,7 @@ def link_obstable_samplebars(ele, ele["samplebars"]["wid"]["y1_select"].js_on_change('value', bar_select_callback, change_y_counts_label_callback, sort_groupby_callback) ele["samplebars"]["wid"]["y2_select"].js_on_change('value', plot_obs_callback, change_y_obs_label_callback, sort_groupby_callback) ele["mgnify"]["wid"]["biome_spinner"].js_on_change('value', mgnify_callback) - ele["contaminants"]["wid"]["contaminant_select"].js_on_change('value', contaminants_callback) + ele["references"]["wid"]["references_select"].js_on_change('value', 
references_callback) def link_heatmap_widgets(ele, cds_d_samples, diff --git a/grimer/cds.py b/grimer/cds.py index a71a2c4..4c0de94 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -21,10 +21,10 @@ def generate_dict_taxname(tax, taxids): return id_name -def generate_cds_plot_contaminants(table, tax, references): - # Stacked list of contaminants for each observation +def generate_cds_plot_references(table, tax, references): + # Stacked list of references, accounting for its lineage matches # index -> observations (repeated) - # columns -> "rank", "annot", "direct", "child", "parent" + # columns -> "rank", "ref", "direct", "child", "parent" clist = [] for rank in table.ranks(): for obs in table.observations(rank): @@ -35,11 +35,11 @@ def generate_cds_plot_contaminants(table, tax, references): if direct + child + parent > 0: clist.append([obs, rank, desc, direct, child, parent]) - df_contaminants = pd.DataFrame(clist, columns=["obs", "rank", "annot", "direct", "child", "parent"]) - df_contaminants.set_index('obs', inplace=True) + df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "child", "parent"]) + df_references.set_index('obs', inplace=True) - print_df(df_contaminants, "df_contaminants -> cds_p_contaminants") - return ColumnDataSource(df_contaminants) + print_df(df_references, "df_references -> cds_p_references") + return ColumnDataSource(df_references) def generate_cds_annotations(table, references, controls, decontam): # Stacked matrix of true annotations (omit false) diff --git a/grimer/grimer.py b/grimer/grimer.py index ed960db..7e55979 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -176,7 +176,7 @@ def main(): metadata = None # Sources of contamination/references/controls - print_log("- Parsing sources (contamination/references/controls)") + print_log("- Parsing references and controls") references = {} if args.tax == "ncbi": references = parse_references(cfg, tax, table.ranks()) @@ -227,7 +227,7 @@ def main(): # df: index (unique sample-ids), aux|..., bar|..., tax|... 
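The stacked ("long") layout that generate_cds_plot_references produces is easiest to see with plain pandas. Below is a minimal sketch with hypothetical taxids and a single reference source; the real rows and counts come from the parsed table and the Reference objects:

```python
import pandas as pd

# One row per (observation, rank, reference) with at least one match;
# observations repeat in the index, one entry per rank they occur in.
clist = [
    ["562", "species", "Contaminants", 2, 0, 1],  # hypothetical taxid
    ["561", "genus", "Contaminants", 0, 3, 0],    # hypothetical taxid
]
df_references = pd.DataFrame(clist,
                             columns=["obs", "rank", "ref", "direct", "child", "parent"])
df_references.set_index("obs", inplace=True)

# Bokeh's vbar_stack later reads "direct"/"child"/"parent" as stacked series,
# and an IndexFilter narrows the view to the rows of the selected observation.
print(df_references)
```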
cds_p_samplebars = generate_cds_bars(table) # stacked: - cds_p_contaminants = generate_cds_plot_contaminants(table, tax, references) + cds_p_references = generate_cds_plot_references(table, tax, references) # matrix: index (unique sample-ids), concentrations, controls, counts cds_p_decontam = generate_cds_plot_decontam(decontam) if decontam else None # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} @@ -284,10 +284,10 @@ def main(): ele["infopanel"] = {} ele["infopanel"]["textarea"] = plot_infopanel() - # contaminants - ele["contaminants"] = {} - ele["contaminants"]["fig"], ele["contaminants"]["filter"] = plot_contaminants(table, cds_p_contaminants, dict_d_taxname) - ele["contaminants"]["wid"] = plot_contaminants_widgets(references) + # references + ele["references"] = {} + ele["references"]["fig"], ele["references"]["filter"] = plot_references(table, cds_p_references, dict_d_taxname) + ele["references"]["wid"] = plot_references_widgets(references) # mgnify ele["mgnify"] = {} @@ -362,7 +362,7 @@ def main(): cds_p_decontam, cds_p_decontam_models, cds_d_decontam, - cds_p_contaminants, + cds_p_references, table.ranks(), min_obs_perc, max_total_count, diff --git a/grimer/layout.py b/grimer/layout.py index 587d1d4..31ae4a9 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -19,7 +19,7 @@ def make_layout(ele, version, logo_path, title): width=top_panel_width_sides) info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] - info_tabs.append(Panel(child=column(ele["contaminants"]["fig"], row(ele["contaminants"]["wid"]["contaminant_select"], ele["contaminants"]["wid"]["help_button"])), title="CC")) + info_tabs.append(Panel(child=column(ele["references"]["fig"], row(ele["references"]["wid"]["references_select"], ele["references"]["wid"]["help_button"])), title="Refs.")) if ele["mgnify"]["fig"]: info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], row(ele["mgnify"]["wid"]["biome_spinner"], ele["mgnify"]["wid"]["help_button"])), title="MGNify")) diff --git a/grimer/plots.py b/grimer/plots.py index 855cdfb..a60c764 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -277,7 +277,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec "help_button": help_button(title="Sample bars", text=help_text)} -def plot_obstable(cds_p_obstable, ranks, contaminant_names, control_names): +def plot_obstable(cds_p_obstable, ranks, reference_names, control_names): # General filter for widgets widgets_filter = IndexFilter() @@ -297,8 +297,8 @@ def plot_obstable(cds_p_obstable, ranks, contaminant_names, control_names): for ctrl_name in control_names: table_cols.append(TableColumn(field="col|" + ctrl_name, title="(F) " + ctrl_name, default_sort="descending", formatter=NumberFormatter(format="0.00%"))) - for cont_name in contaminant_names: - table_cols.append(TableColumn(field="col|" + cont_name, title=cont_name, default_sort="descending")) + for ref_name in reference_names: + table_cols.append(TableColumn(field="col|" + ref_name, title=ref_name, default_sort="descending")) if "col|decontam" in cds_p_obstable.data: table_cols.append(TableColumn(field="col|decontam", title="DECONTAM", default_sort="descending")) @@ -428,8 +428,8 @@ def plot_decontam_widgets(): "help_button": help_button(title="DECONTAM", text=help_text, align="start")} -def plot_contaminants(table, cds_p_contaminants, dict_d_taxname): - contaminants_fig = figure(x_range=table.ranks(), height=150, width=300, +def plot_references(table, cds_p_references, dict_d_taxname): + references_fig = 
figure(x_range=table.ranks(), height=150, width=300, tools="save,reset") # Need to pass dict_d_taxname inside a one column data @@ -438,7 +438,7 @@ def plot_contaminants(table, cds_p_contaminants, dict_d_taxname): code="return dict_d_taxname.data.dict_d_taxname[0][value]; // value holds the @taxid" ) # Add custom tooltip for heatmap (taxid->name) - contaminants_fig.add_tools(HoverTool( + references_fig.add_tools(HoverTool( tooltips=[ ('Observation', '@obs{custom}'), ('# reported (directly)', '@direct'), @@ -450,38 +450,38 @@ def plot_contaminants(table, cds_p_contaminants, dict_d_taxname): formatters={"@obs": taxid_name_custom} )) - contaminants_filter = IndexFilter(indices=[]) - cds_view_contaminants = CDSView(source=cds_p_contaminants, filters=[contaminants_filter]) + references_filter = IndexFilter(indices=[]) + cds_view_references = CDSView(source=cds_p_references, filters=[references_filter]) fixed_bar_options = ["direct", "child", "parent"] palette = ["red", "orange", "black"] - contaminants_fig.vbar_stack(fixed_bar_options, - x="rank", - width=1, - source=cds_p_contaminants, - view=cds_view_contaminants, - color=palette, - line_color=None, # to avoid printing small border for zeros - fill_alpha=[1, 0.3, 0.3]) - - contaminants_fig.xaxis.major_label_orientation = "vertical" - contaminants_fig.xgrid.grid_line_color = None - contaminants_fig.xaxis.minor_tick_line_color = None - contaminants_fig.yaxis.minor_tick_line_color = None - contaminants_fig.xaxis.major_tick_line_color = None - contaminants_fig.yaxis.major_tick_line_color = None - contaminants_fig.yaxis.axis_label = "# reported" - - return contaminants_fig, contaminants_filter - - -def plot_contaminants_widgets(references): - contaminant_select = Select(value=list(references.keys())[0], width=200, options=list(references.keys())) + references_fig.vbar_stack(fixed_bar_options, + x="rank", + width=1, + source=cds_p_references, + view=cds_view_references, + color=palette, + line_color=None, # to avoid printing small border for zeros + fill_alpha=[1, 0.3, 0.3]) + + references_fig.xaxis.major_label_orientation = "vertical" + references_fig.xgrid.grid_line_color = None + references_fig.xaxis.minor_tick_line_color = None + references_fig.yaxis.minor_tick_line_color = None + references_fig.xaxis.major_tick_line_color = None + references_fig.yaxis.major_tick_line_color = None + references_fig.yaxis.axis_label = "# reported" + + return references_fig, references_filter + + +def plot_references_widgets(references): + references_select = Select(value=list(references.keys())[0], width=200, options=list(references.keys())) help_text = """ -contaminants explained +references explained """ - return {"contaminant_select": contaminant_select, - "help_button": help_button(title="Common Contaminants", text=help_text, align="start")} + return {"references_select": references_select, + "help_button": help_button(title="References", text=help_text, align="start")} def plot_mgnify(cds_p_mgnify): diff --git a/grimer/utils.py b/grimer/utils.py index a25e107..922b044 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -508,23 +508,23 @@ def include_scripts(scripts): def parse_references(cfg, tax, ranks): - annotations = {} + references = {} - for desc, sf in cfg["annotations"].items(): - annotations[desc] = Source(file=sf) + for desc, sf in cfg["references"].items(): + references[desc] = Source(file=sf) if tax: # Update taxids / get taxid from name - annotations[desc].update_taxids(update_tax_nodes(annotations[desc].ids, tax)) - for i in 
list(annotations[desc].ids.keys()): + references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) + for i in list(references[desc].ids.keys()): # lineage of all children nodes (without itself) for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): for l in lin[1:]: - annotations[desc].add_child(l, i) + references[desc].add_child(l, i) # lineage of all parent nodes (without itself) for l in tax.lineage(i)[:-1]: - annotations[desc].add_parent(l, i) + references[desc].add_parent(l, i) - return annotations + return references def parse_controls(cfg, table): From e9402b79e2708078ce7560c587d943fbee428603 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 30 Sep 2021 12:13:31 +0200 Subject: [PATCH 07/50] improved sizes, references --- grimer/cds.py | 2 +- grimer/grimer.py | 44 ++++++++------ grimer/layout.py | 36 +++++++----- grimer/plots.py | 94 +++++++++++++++++++----------- grimer/{source.py => reference.py} | 4 +- grimer/sourceold.py | 77 ------------------------ grimer/utils.py | 33 ++++++----- 7 files changed, 128 insertions(+), 162 deletions(-) rename grimer/{source.py => reference.py} (97%) delete mode 100644 grimer/sourceold.py diff --git a/grimer/cds.py b/grimer/cds.py index 4c0de94..f6d23be 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -22,7 +22,7 @@ def generate_dict_taxname(tax, taxids): def generate_cds_plot_references(table, tax, references): - # Stacked list of references, accounting for its lineage matches + # Stacked list of references, accounting for lineage matches # index -> observations (repeated) # columns -> "rank", "ref", "direct", "child", "parent" clist = [] diff --git a/grimer/grimer.py b/grimer/grimer.py index 7e55979..d289c85 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -152,12 +152,10 @@ def main(): replace_zero_value = table_df[table_df.gt(0)].min().min() / int(args.replace_zeros) except: replace_zero_value = float(args.replace_zeros) - - # Do not allow value 1 using log if replace_zero_value == 1 and args.transformation == "log": - replace_zero_value = 0.999999 + replace_zero_value = 0.999999 # Do not allow value 1 using log - # Parse Metadata + # Metadata max_metadata_cols = args.metadata_cols if args.metadata: print_log("- Parsing metadata") @@ -175,14 +173,19 @@ def main(): else: metadata = None - # Sources of contamination/references/controls - print_log("- Parsing references and controls") + # References (only possible with ncbi identifiers) references = {} - if args.tax == "ncbi": + if "references" in cfg and args.tax == "ncbi": + print_log("- Parsing references") references = parse_references(cfg, tax, table.ranks()) + print_log("") - controls, control_samples = parse_controls(cfg, table) - print_log("") + controls, control_samples = [{}, {}] + if "controls" in cfg: + print_log("- Parsing controls") + # Controls + controls, control_samples = parse_controls(cfg, table) + print_log("") # Run and load decontam results if args.decontam: @@ -219,14 +222,13 @@ def main(): ############ _d_ : data -> auxiliary containers to be used/shared among plots ############ usually by copying and/or transforming values into a _p_ container - # _p_ # df: index (unique observations), col|..., tax|..., aux|ref # this cds is an exception and contains data to plot (col|) and auxiliary data (tax|) cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam) # df: index (unique sample-ids), aux|..., bar|..., tax|...
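The zero-replacement rule changed above is worth spelling out: an integer --replace-zeros value divides the smallest non-zero count in the table, any other value is taken as a fixed float, and a result of exactly 1 is nudged down under log transformation because log10(1) == 0 would erase it. A standalone sketch of that logic, with a toy DataFrame and hypothetical values:

```python
import pandas as pd

def zero_replacement(table_df: pd.DataFrame, replace_zeros: str, transformation: str) -> float:
    try:
        # Integer argument: divide the smallest non-zero count in the table
        value = table_df[table_df.gt(0)].min().min() / int(replace_zeros)
    except ValueError:
        # Anything else is used directly as a fixed float value
        value = float(replace_zeros)
    if value == 1 and transformation == "log":
        value = 0.999999  # log10(1) == 0 would make the replaced cells vanish
    return value

counts = pd.DataFrame({"taxA": [0, 5, 2], "taxB": [1, 0, 0]}, index=["s1", "s2", "s3"])
print(zero_replacement(counts, "1000", "log"))  # 1 / 1000 = 0.001
print(zero_replacement(counts, "0.5", "log"))   # fixed value 0.5
```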
cds_p_samplebars = generate_cds_bars(table) - # stacked: + # stacked: index (repeated observations), rank, ref, direct, child, parent cds_p_references = generate_cds_plot_references(table, tax, references) # matrix: index (unique sample-ids), concentrations, controls, counts cds_p_decontam = generate_cds_plot_decontam(decontam) if decontam else None @@ -273,12 +275,18 @@ def main(): ############ "fig": main figure ############ "wid": widgets + # Layout and plot sizes + sizes = {} + sizes["overview_top_panel_height"] = 300 + sizes["overview_top_panel_width_left"] = 250 + sizes["overview_top_panel_width_right"] = 500 + ele = {} # obstable ele["obstable"] = {} - ele["obstable"]["fig"], ele["obstable"]["widgets_filter"] = plot_obstable(cds_p_obstable, table.ranks(), references.keys(), controls.keys()) - ele["obstable"]["wid"] = plot_obstable_widgets(dict_d_taxname, max(cds_p_obstable.data["col|total_counts"])) + ele["obstable"]["fig"], ele["obstable"]["widgets_filter"] = plot_obstable(sizes, cds_p_obstable, table.ranks(), references.keys(), controls.keys()) + ele["obstable"]["wid"] = plot_obstable_widgets(sizes, dict_d_taxname, max(cds_p_obstable.data["col|total_counts"])) # infopanel ele["infopanel"] = {} @@ -286,13 +294,13 @@ def main(): # references ele["references"] = {} - ele["references"]["fig"], ele["references"]["filter"] = plot_references(table, cds_p_references, dict_d_taxname) - ele["references"]["wid"] = plot_references_widgets(references) + ele["references"]["fig"], ele["references"]["filter"] = plot_references(sizes, table, cds_p_references, dict_d_taxname) + ele["references"]["wid"] = plot_references_widgets(sizes, references) # mgnify ele["mgnify"] = {} if cds_p_mgnify: - ele["mgnify"]["fig"], ele["mgnify"]["filter"] = plot_mgnify(cds_p_mgnify) + ele["mgnify"]["fig"], ele["mgnify"]["filter"] = plot_mgnify(sizes, cds_p_mgnify) else: ele["mgnify"]["fig"], ele["mgnify"]["filter"] = None, None ele["mgnify"]["wid"] = plot_mgnify_widgets() @@ -301,7 +309,7 @@ def main(): ele["decontam"] = {} ele["decontam"]["wid"] = {} if decontam: - ele["decontam"]["fig"] = plot_decontam(cds_p_decontam, cds_p_decontam_models, min_obs_perc) + ele["decontam"]["fig"] = plot_decontam(sizes, cds_p_decontam, cds_p_decontam_models, min_obs_perc) else: ele["decontam"]["fig"] = None ele["decontam"]["wid"] = plot_decontam_widgets() @@ -401,7 +409,7 @@ def main(): script_dir, _ = os.path.split(__file__) logo_path = os.path.join(script_dir, "img", "logo.png") - final_layout = make_layout(ele, version, logo_path, args.title) + final_layout = make_layout(ele, sizes, version, logo_path, args.title) template = include_scripts({os.path.join(script_dir, "js", "func.js"): "script", os.path.join(script_dir, "js", "popup.js"): "script", diff --git a/grimer/layout.py b/grimer/layout.py index 31ae4a9..30d4e6c 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -3,34 +3,42 @@ import base64 -def make_layout(ele, version, logo_path, title): +def make_layout(ele, sizes, version, logo_path, title): - top_panel_height = 200 - top_panel_width_sides = 300 - filterwidgets = column(row(ele["obstable"]["wid"]["frequency_spinner"], - ele["obstable"]["wid"]["counts_perc_avg_spinner"], - ele["obstable"]["wid"]["help_button"]), + filterwidgets = column(ele["obstable"]["wid"]["frequency_spinner"], + ele["obstable"]["wid"]["counts_perc_avg_spinner"], ele["obstable"]["wid"]["total_counts_spinner"], - ele["obstable"]["wid"]["name_multichoice"]) + ele["obstable"]["wid"]["name_multichoice"], + ele["obstable"]["wid"]["help_button"]) 
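The fixed-size tab pattern used in make_layout reduces to a few lines of Bokeh. A sketch assuming Bokeh 2.x (where Panel and Tabs live in bokeh.models) and a hypothetical sizes dict like the one introduced in grimer.py:

```python
from bokeh.layouts import column
from bokeh.models import Panel, Spinner, Tabs

# Hypothetical sizes; the real values are defined once in main() and shared.
sizes = {"overview_top_panel_height": 300, "overview_top_panel_width_left": 250}

widgets = column(Spinner(title="Frequency", low=0, high=100, value=0, step=1),
                 Spinner(title="Total counts", low=1, high=1000, value=1, step=1))

# sizing_mode="fixed" keeps the tab panel from resizing with the page,
# so the width/height derived from the sizes dict are respected.
tabs = Tabs(tabs=[Panel(child=widgets, title="Filter")],
            sizing_mode="fixed",
            height=sizes["overview_top_panel_height"] + 20,
            width=sizes["overview_top_panel_width_left"])
```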
filterwidgetstabs = Tabs(tabs=[Panel(child=filterwidgets, title="Filter")], sizing_mode="fixed", - height=top_panel_height + 20, - width=top_panel_width_sides) + height=sizes["overview_top_panel_height"] + 20, + width=sizes["overview_top_panel_width_left"]) info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] - info_tabs.append(Panel(child=column(ele["references"]["fig"], row(ele["references"]["wid"]["references_select"], ele["references"]["wid"]["help_button"])), title="Refs.")) + info_tabs.append(Panel(child=column(ele["references"]["fig"], + row(ele["references"]["wid"]["references_select"], + ele["references"]["wid"]["help_button"]) + ), title="References")) if ele["mgnify"]["fig"]: - info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], row(ele["mgnify"]["wid"]["biome_spinner"], ele["mgnify"]["wid"]["help_button"])), title="MGNify")) + info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], + row(ele["mgnify"]["wid"]["biome_spinner"], + ele["mgnify"]["wid"]["help_button"]) + ), title="MGNify")) if ele["decontam"]["fig"]: - info_tabs.append(Panel(child=column(ele["decontam"]["fig"], row(ele["decontam"]["wid"]["pvalue_text"], ele["decontam"]["wid"]["pvalue_input"], ele["decontam"]["wid"]["help_button"])), title="DECONTAM")) + info_tabs.append(Panel(child=column(ele["decontam"]["fig"], + row(ele["decontam"]["wid"]["pvalue_text"], + ele["decontam"]["wid"]["pvalue_input"], + ele["decontam"]["wid"]["help_button"]) + ), title="DECONTAM")) infotabs = Tabs(tabs=info_tabs, sizing_mode="fixed", - height=top_panel_height + 20, - width=top_panel_width_sides) + height=sizes["overview_top_panel_height"] + 20, + width=sizes["overview_top_panel_width_right"]) row_obstable = row(filterwidgetstabs, ele["obstable"]["fig"], diff --git a/grimer/plots.py b/grimer/plots.py index a60c764..aaa5a5c 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -10,7 +10,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): # Bar plots has 3 main stacks: selection, others, unassigned - # stacks can be annotated with sources + # stacks can be annotated with references and controls samplebars_fig = figure(x_range=FactorRange(factors=cds_p_samplebars.data["aux|factors"]), y_range=Range1d(start=0, end=max_total_count), plot_height=400, @@ -264,7 +264,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec Bars can be annotated by taxonomic rank. The annotation will show the proportion of counts matching the selected annotation. -Raw counts (#) can be normalized (%). Observation counts (#) can be normalized (%) or log transformed (log10(#)) for better visualization. +Raw counts and observations (#) can be normalized (%) and/or log transformed (log10) for better visualization. 
""" return {"y1_select": y1_select, @@ -277,7 +277,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec "help_button": help_button(title="Sample bars", text=help_text)} -def plot_obstable(cds_p_obstable, ranks, reference_names, control_names): +def plot_obstable(sizes, cds_p_obstable, ranks, reference_names, control_names): # General filter for widgets widgets_filter = IndexFilter() @@ -303,7 +303,7 @@ def plot_obstable(cds_p_obstable, ranks, reference_names, control_names): if "col|decontam" in cds_p_obstable.data: table_cols.append(TableColumn(field="col|decontam", title="DECONTAM", default_sort="descending")) - datatable = DataTable(height=200, + datatable = DataTable(height=sizes["overview_top_panel_height"], sizing_mode="stretch_width", index_position=None, autosize_mode="fit_viewport", @@ -321,39 +321,42 @@ def plot_obstable(cds_p_obstable, ranks, reference_names, control_names): return obstable, widgets_filter -def plot_obstable_widgets(dict_d_taxname, max_count_rank): +def plot_obstable_widgets(sizes, dict_d_taxname, max_count_rank): # Filtering options - frequency_spinner = Spinner(title="Frequency", low=0, high=100, value=0, step=1, width=100, height=50) - counts_perc_avg_spinner = Spinner(title="Avg. counts/sample", low=0, high=100, value=0, step=0.1, width=100, height=50) - total_counts_spinner = Spinner(title="Total counts", low=1, high=max_count_rank, step=1, value=1, width=100, height=50) + spinner_width = sizes["overview_top_panel_width_left"] - 20 + frequency_spinner = Spinner(title="Frequency", low=0, high=100, value=0, step=1, width=spinner_width, height=50) + counts_perc_avg_spinner = Spinner(title="Avg. counts/sample", low=0, high=100, value=0, step=0.1, width=spinner_width, height=50) + total_counts_spinner = Spinner(title="Total counts", low=1, high=max_count_rank, step=1, value=1, width=spinner_width, height=50) # Create unique list of names with taxids for filtering. map to str and set to get unique unique_dict_d_taxname_tuples = set(zip(dict_d_taxname.keys(), map(str, dict_d_taxname.values()))) - name_multichoice = MultiChoice(title="Obs. name or identifier", options=list(unique_dict_d_taxname_tuples), sizing_mode="fixed", width=250, height=100) + name_multichoice = MultiChoice(title="Obs. name or id", + options=list(unique_dict_d_taxname_tuples), + sizing_mode="fixed", + width=sizes["overview_top_panel_width_left"] - 20, height=60) help_text = """ -Summary of observations among all samples. If taxonomy is provided, panels will show different taxonomic ranks. +Summary of observations among all samples. If taxonomy is provided, panels will show different taxonomic ranks. -Clicking on the entries will load further information of the observation in the other plots/panels. +Clicking on the entries will load further information of the observation in the other plots/panels. The table contain the following fixed columns: - **Name**: Taxonomic or given observation name -- **Frequency**: How often the observation is occurring among all samples +- **Frequency**: How often the observation is occurring among all samples - **Avg. 
counts/sample**: Averge percentage of the observation among all samples - **Total counts**: Sum of counts of this observation in all samples - **(F) Controls**: (F)requency for controls: how often the observation is occurring in the given control samples -- **CC Bacteria, CC Viruses, ...**: How many times the observation was reported as a Common Contaminant -- **References**: In which reference sets this observation occurs -- **DECONTAM**: Contamination results from DECONTAM method +- **References**: How many times the observation was reported in the references +- **DECONTAM**: Final contamination output from DECONTAM method -Widgets can filter entries of the table. "Obs. name or identifier" filters the lineage of the entries, if taxonomy is provided. With that is possible to, for example, filter a certain genus and the table will show only children species. +Widgets can filter entries of the table. "Obs. name or id" filters the lineage of the entries, if taxonomy is provided. With that is possible to, for example, filter a certain genus and the table will show only children species. """ return {"frequency_spinner": frequency_spinner, "counts_perc_avg_spinner": counts_perc_avg_spinner, "total_counts_spinner": total_counts_spinner, "name_multichoice": name_multichoice, - "help_button": help_button(title="Observation table", text=help_text)} + "help_button": help_button(title="Observation table", text=help_text, align="start")} def plot_infopanel(): @@ -362,10 +365,11 @@ def plot_infopanel(): disabled=False) -def plot_decontam(cds_p_decontam, cds_p_decontam_lines, min_obs_perc): +def plot_decontam(sizes, cds_p_decontam, cds_p_decontam_lines, min_obs_perc): decontam_fig = figure(x_axis_type="log", y_axis_type="log", - height=170, width=300, + height=sizes["overview_top_panel_height"] - 50, + width=sizes["overview_top_panel_width_right"], sizing_mode="stretch_width", tools="save") @@ -398,15 +402,14 @@ def plot_decontam(cds_p_decontam, cds_p_decontam_lines, min_obs_perc): decontam_fig.line(x="x", y="y_noncont", source=cds_p_decontam_lines, - color="black", line_dash="dashed") + color="black", + line_dash="dashed") decontam_fig.xaxis.axis_label = 'DNA Concentration/Total counts' decontam_fig.yaxis.axis_label = 'obs. 
counts' decontam_fig.y_range.start = min_obs_perc decontam_fig.y_range.end = 1 - - return decontam_fig @@ -428,9 +431,11 @@ def plot_decontam_widgets(): "help_button": help_button(title="DECONTAM", text=help_text, align="start")} -def plot_references(table, cds_p_references, dict_d_taxname): - references_fig = figure(x_range=table.ranks(), height=150, width=300, - tools="save,reset") +def plot_references(sizes, table, cds_p_references, dict_d_taxname): + references_fig = figure(x_range=table.ranks(), + height=sizes["overview_top_panel_height"] - 50, + width=sizes["overview_top_panel_width_right"], + tools="save,reset") # Need to pass dict_d_taxname inside a one column data taxid_name_custom = CustomJSHover( @@ -442,8 +447,8 @@ def plot_references(table, cds_p_references, dict_d_taxname): tooltips=[ ('Observation', '@obs{custom}'), ('# reported (directly)', '@direct'), - (' as children', '@child'), - (' as parent', '@parent'), + ('as child', '@child'), + ('as parent', '@parent'), ], mode="mouse", point_policy="follow_mouse", @@ -453,6 +458,8 @@ def plot_references(table, cds_p_references, dict_d_taxname): references_filter = IndexFilter(indices=[]) cds_view_references = CDSView(source=cds_p_references, filters=[references_filter]) + references_fig.add_layout(Legend(), 'above') + fixed_bar_options = ["direct", "child", "parent"] palette = ["red", "orange", "black"] references_fig.vbar_stack(fixed_bar_options, @@ -462,7 +469,10 @@ def plot_references(table, cds_p_references, dict_d_taxname): view=cds_view_references, color=palette, line_color=None, # to avoid printing small border for zeros - fill_alpha=[1, 0.3, 0.3]) + fill_alpha=[1, 0.3, 0.3], + legend_label=fixed_bar_options) + + references_fig.y_range.start = 0 references_fig.xaxis.major_label_orientation = "vertical" references_fig.xgrid.grid_line_color = None @@ -472,20 +482,35 @@ def plot_references(table, cds_p_references, dict_d_taxname): references_fig.yaxis.major_tick_line_color = None references_fig.yaxis.axis_label = "# reported" + references_fig.legend.margin = 0 + references_fig.legend.border_line_width = 0 + references_fig.legend.spacing = 0 + references_fig.legend.padding = 0 + references_fig.legend.orientation = "horizontal" + references_fig.legend.location = "bottom_right" + return references_fig, references_filter -def plot_references_widgets(references): - references_select = Select(value=list(references.keys())[0], width=200, options=list(references.keys())) +def plot_references_widgets(sizes, references): + references_select = Select(value=list(references.keys())[0], width=sizes["overview_top_panel_width_right"] - 70, options=list(references.keys())) help_text = """ -references explained +Plot of number of occurences of provided references for each observation and its lineage. 
+ +**direct** counts represent direct matches with reference identifiers + +**child** counts account for the number of times a related parent on the lineage of the selected observation node was reported among references + +**parent** counts account for the number of times related children (not necessarily reported) on the lineage of the selected observation node were reported among references """ + return {"references_select": references_select, "help_button": help_button(title="References", text=help_text, align="start")} def plot_mgnify(cds_p_mgnify): -def plot_mgnify(cds_p_mgnify): - mgnify_fig = figure(height=150, width=300, +def plot_mgnify(sizes, cds_p_mgnify): + mgnify_fig = figure(height=sizes["overview_top_panel_height"] - 50, + width=sizes["overview_top_panel_width_right"], tools="save,wheel_zoom,reset") mgnify_filter = IndexFilter(indices=[]) @@ -890,7 +915,7 @@ def plot_correlation_widgets(ranks, top_obs_corr): neg_slider = RangeSlider(start=-1, end=0, value=(-1, 0), step=.01, title="Negative correlation") pos_slider = RangeSlider(start=0, end=1, value=(0, 1), step=.01, title="Positive correlation") pval_spinner = Spinner(title="Corrected P-value", low=0, high=1, step=0.01, value=1, width=100, height=50) - + help_text = """ Spearman correlation coefficient with associated and corrected (Benjamini/Hochberg) p-values between the top """ + str(top_obs_corr) + """ most abundant observations. [spearmanr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html) function from scipy is used. @@ -905,6 +930,7 @@ def plot_correlation_widgets(ranks, top_obs_corr): "pval_spinner": pval_spinner, "help_button": help_button(title="Correlation", text=help_text)} + def help_button(title: str="", text: str="", align: str="end"): hb = Button(width=32, height=32, label="?", align=align, button_type="warning") diff --git a/grimer/source.py b/grimer/reference.py similarity index 97% rename from grimer/source.py rename to grimer/reference.py index c4615f2..97e9abf 100644 --- a/grimer/source.py +++ b/grimer/reference.py @@ -1,7 +1,7 @@ import yaml -class Source: +class Reference: def __init__(self, file: str=None, ids: list=[]): self.ids = {} # {refid: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)}} self.children = {} # {child_id: set(refids)} @@ -15,7 +15,7 @@ def __init__(self, file: str=None, ids: list=[]): def __repr__(self): args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] - return 'Source({})'.format(', '.join(args)) + return 'Reference({})'.format(', '.join(args)) diff --git a/grimer/sourceold.py b/grimer/sourceold.py deleted file mode 100644 index e9d73e7..0000000 --- a/grimer/sourceold.py +++ /dev/null @@ -1,77 +0,0 @@ -import yaml - - -class SourceOld: - def __init__(self, file: str=None, ids: list=[]): - # Only leaf ids/nodes - self.ids = set() - self.lineage = set() - - # {id: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)} - self.refs = {} - - if file is not None: - self.parse(file) - elif ids: - self.ids.update(ids) - self.lineage.update(ids) - - def __repr__(self): - args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] - return 'Source({})'.format(', '.join(args)) - - def parse(self, file): - with open(file, 'r') as fh: - if file.endswith(".yml") or file.endswith(".yaml"): - src = yaml.safe_load(fh) - for desc, val in src.items(): - for ref, v in val.items(): - str_ids = list(map(str, v["ids"])) - self.ids.update(str_ids) - self.lineage.update(str_ids) - for i in str_ids: - 
self.add_refs_desc(i, (ref, v["url"]), desc) - else: - for line in fh: - main_id = line.rstrip() - self.ids.add(main_id) - self.lineage.add(main_id) - - def update_lineage(self, ids): - self.lineage.update(ids) - - def update_taxids(self, taxid_updated): - # Update taxonomy entries or convert names to taxid - for node, upd_node in taxid_updated.items(): - if upd_node is not None and upd_node != node: - print("Updated taxonomic node: " + node + " -> " + upd_node) - self.ids.discard(node) - self.ids.add(upd_node) - self.lineage.discard(node) - self.lineage.add(upd_node) - if node in self.refs: - self.refs[upd_node] = self.refs.pop(node) - - def add_refs_desc(self, i, ref, desc): - if i not in self.refs: - self.refs[i] = {} - if ref not in self.refs[i]: - self.refs[i][ref] = set() - if desc is not None: - self.refs[i][ref].add(desc) - - def get_refs_desc(self, i): - return self.refs[i] if i in self.refs else {} - - def get_refs(self, i): - return list(self.refs[i].keys()) if i in self.refs else () - - def get_refs_count(self, i): - return len(self.refs[i]) if i in self.refs else 0 - - def update_refs(self, taxid_parent_rank): - for taxid, parent_taxid in taxid_parent_rank.items(): - if parent_taxid is not None and taxid in self.refs: - for i in self.refs[taxid]: - for r in self.refs[taxid][i]: - self.add_refs_desc(parent_taxid, i, r) diff --git a/grimer/utils.py b/grimer/utils.py index 922b044..94af934 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -11,7 +11,7 @@ #Internal from grimer.decontam import Decontam from grimer.plots import make_color_palette -from grimer.source import Source +from grimer.reference import Reference #biom from biom import parse_table as parse_table_biom @@ -511,7 +511,7 @@ def parse_references(cfg, tax, ranks): references = {} for desc, sf in cfg["references"].items(): - references[desc] = Source(file=sf) + references[desc] = Reference(file=sf) if tax: # Update taxids / get taxid from name references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) @@ -531,23 +531,24 @@ def parse_controls(cfg, table): controls = {} control_samples = {} - if "controls" in cfg: - for desc, cf in cfg["controls"].items(): - with open(cf, "r") as file: - samples = file.read().splitlines() - obs = set() - valid_samples = set() - for rank in table.ranks(): - # Retrieve sub-table for every rank and add to the source - control_table = table.get_subtable(rank, samples=samples) - obs.update(control_table.columns.to_list()) - valid_samples.update(control_table.index.to_list()) - - controls[desc] = Source(ids=obs) - control_samples[desc] = list(valid_samples) + for desc, cf in cfg["controls"].items(): + with open(cf, "r") as file: + samples = file.read().splitlines() + obs = set() + valid_samples = set() + for rank in table.ranks(): + # Retrieve sub-table for every rank + control_table = table.get_subtable(rank, samples=samples) + obs.update(control_table.columns.to_list()) + valid_samples.update(control_table.index.to_list()) + + # Add control observations as a reference + controls[desc] = Reference(ids=obs) + control_samples[desc] = list(valid_samples) return controls, control_samples + def run_cmd(cmd, print_stderr: bool=False, exit_on_error: bool=True): errcode = 0 stdout = "" From bb5ac0781a71d46731184817cdfa50f8f4ed820a Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 30 Sep 2021 13:48:57 +0200 Subject: [PATCH 08/50] remove childen references --- grimer/cds.py | 9 ++++----- grimer/grimer.py | 2 +- grimer/plots.py | 9 +++------ grimer/reference.py | 15 +++------------ grimer/utils.py | 4 ---- 5 files changed, 11 insertions(+), 28 deletions(-) diff --git a/grimer/cds.py b/grimer/cds.py index f6d23be..9b3a792 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -24,18 +24,17 @@ def generate_dict_taxname(tax, taxids): def generate_cds_plot_references(table, tax, references): # Stacked list of references, accounting for lineage matches # index -> observations (repeated) - # columns -> "rank", "ref", "direct", "child", "parent" + # columns -> "rank", "ref", "direct", "parent" clist = [] for rank in table.ranks(): for obs in table.observations(rank): for desc, ref in references.items(): direct = ref.get_refs_count(obs, direct=True) - child = ref.get_refs_count(obs, children=True) parent = ref.get_refs_count(obs, parents=True) - if direct + child + parent > 0: - clist.append([obs, rank, desc, direct, child, parent]) + if direct + parent > 0: + clist.append([obs, rank, desc, direct, parent]) - df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "child", "parent"]) + df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "parent"]) df_references.set_index('obs', inplace=True) print_df(df_references, "df_references -> cds_p_references") diff --git a/grimer/grimer.py b/grimer/grimer.py index d289c85..eff1d2b 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -228,7 +228,7 @@ def main(): cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam) # df: index (unique sample-ids), aux|..., bar|..., tax|... cds_p_samplebars = generate_cds_bars(table) - # stacked: index (repeated observations), rank, ref, direct, child, parent + # stacked: index (repeated observations), rank, ref, direct, parent cds_p_references = generate_cds_plot_references(table, tax, references) # matrix: index (unique sample-ids), concentrations, controls, counts cds_p_decontam = generate_cds_plot_decontam(decontam) if decontam else None diff --git a/grimer/plots.py b/grimer/plots.py index aaa5a5c..851366f 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -447,7 +447,6 @@ def plot_references(sizes, table, cds_p_references, dict_d_taxname): tooltips=[ ('Observation', '@obs{custom}'), ('# reported (directly)', '@direct'), - ('as child', '@child'), ('as parent', '@parent'), ], mode="mouse", @@ -460,8 +459,8 @@ def plot_references(sizes, table, cds_p_references, dict_d_taxname): references_fig.add_layout(Legend(), 'above') - fixed_bar_options = ["direct", "child", "parent"] - palette = ["red", "orange", "black"] + fixed_bar_options = ["direct", "parent"] + palette = ["red", "black"] references_fig.vbar_stack(fixed_bar_options, x="rank", width=1, @@ -469,7 +468,7 @@ def plot_references(sizes, table, cds_p_references, dict_d_taxname): view=cds_view_references, color=palette, line_color=None, # to avoid printing small border for zeros - fill_alpha=[1, 0.3, 0.3], + fill_alpha=[1, 0.3], legend_label=fixed_bar_options) references_fig.y_range.start = 0 @@ -499,8 +498,6 @@ def plot_references_widgets(sizes, references): **direct** counts represent direct matches with reference identifiers -**child** counts accounts for the number of times a related parent on the lineage of the selected observation node was reported among references - **parent** counts accounts for the number of times related children 
(not necessarily reported) on the lineage of the selected observation node were reported among references """ diff --git a/grimer/reference.py b/grimer/reference.py index 97e9abf..a7e99a7 100644 --- a/grimer/reference.py +++ b/grimer/reference.py @@ -4,7 +4,6 @@ class Reference: def __init__(self, file: str=None, ids: list=[]): self.ids = {} # {refid: {ref1: set(desc1, desc2,...), ref2: set(desc3,...)}} - self.children = {} # {child_id: set(refids)} self.parents = {} # {parent_id: set(refids)} if file is not None: @@ -26,11 +25,6 @@ def add(self, i, ref: str=None, desc: str=None): if desc is not None: self.ids[i][ref].add(desc) - def add_child(self, child, refid): - if child not in self.children: - self.children[child] = set() - self.children[child].add(refid) - def add_parent(self, parent, refid): if parent not in self.parents: self.parents[parent] = set() @@ -57,17 +51,14 @@ def update_taxids(self, taxid_updated): self.ids[upd_node].update(self.ids[node]) self.ids.discard(node) - def get_refs_desc(self, i, direct: bool=False, children: bool=False, parents: bool=False): + def get_refs_desc(self, i, direct: bool=False, parents: bool=False): refs_desc = {} if direct and i in self.ids: refs_desc.update(self.ids[i]) - if children and i in self.children: - for refid in self.children[i]: - refs_desc.update(self.ids[refid]) if parents and i in self.parents: for refid in self.parents[i]: refs_desc.update(self.ids[refid]) return refs_desc - def get_refs_count(self, i, direct: bool=False, children: bool=False, parents: bool=False): - return len(self.get_refs_desc(i, direct, children, parents)) + def get_refs_count(self, i, direct: bool=False, parents: bool=False): + return len(self.get_refs_desc(i, direct, parents)) diff --git a/grimer/utils.py b/grimer/utils.py index 94af934..762adfe 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -516,10 +516,6 @@ def parse_references(cfg, tax, ranks): # Update taxids / get taxid from name references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) for i in list(references[desc].ids.keys()): - # lineage of all children nodes (without itself) - for lin in map(lambda txid: tax.lineage(txid, root_node=i), tax.leaves(i)): - for l in lin[1:]: - references[desc].add_child(l, i) # lineage of all parent nodes (without itself) for l in tax.lineage(i)[:-1]: references[desc].add_parent(l, i) From 502cb1fc96fedccdc08307dd0d0658dee5432d46 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 30 Sep 2021 14:50:17 +0200 Subject: [PATCH 09/50] fix decontam run --- grimer/utils.py | 2 +- scripts/run_decontam.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/grimer/utils.py b/grimer/utils.py index 762adfe..8dbf1c6 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -400,7 +400,7 @@ def run_decontam(cfg, table, metadata, control_samples): decontam.add_rank_empty(rank, table.observations(rank)) else: # normalized and write temporary table for each rank - transform_table(table.data[rank], table.total, "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) + transform_table(table.data[rank], table.total[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) cmd = " ".join(["scripts/run_decontam.R", "--resout " + tmp_output_prefix + "decontam_out.tsv", diff --git a/scripts/run_decontam.R b/scripts/run_decontam.R index 7b7a714..ea0e0c2 100755 --- a/scripts/run_decontam.R +++ b/scripts/run_decontam.R @@ -95,7 +95,7 @@ count_matrix <- data.matrix(data.frame(count_table[,-1], row.names = rows_table, # Load concentration table if(!args$concentrations==""){ concentrations <- read.table(file=args$concentrations, sep='\t', header=FALSE, check.names=FALSE) - concentrations_list <- concentrations[ , "V2"] + concentrations_list <- concentrations[ (concentrations[, "V1"] %in% rows_table) , "V2"] } # Load list of controls From 8de801015479ec292e07b8221dbedcaf635040e9 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 6 Oct 2021 11:27:52 +0200 Subject: [PATCH 10/50] correlation with prop. and clr --- grimer/callbacks.py | 3 - grimer/cds.py | 29 +- grimer/composition.py | 1639 ----------------------------------------- grimer/grimer.py | 8 +- grimer/layout.py | 1 - grimer/plots.py | 42 +- grimer/prop.py | 79 -- grimer/utils.py | 33 +- 8 files changed, 38 insertions(+), 1796 deletions(-) delete mode 100644 grimer/composition.py delete mode 100644 grimer/prop.py diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 114fa42..17cc1c8 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -649,13 +649,11 @@ def link_correlation_widgets(ele, cds_p_correlation): args=dict(rho_filter=ele["correlation"]["rho_filter"], neg_slider=ele["correlation"]["wid"]["neg_slider"], pos_slider=ele["correlation"]["wid"]["pos_slider"], - pval_spinner=ele["correlation"]["wid"]["pval_spinner"], cds_p_correlation=cds_p_correlation), code=''' console.log("filter_callback"); const indices = []; for (var i = 0; i < cds_p_correlation.data["index"].length; i++) { - if (cds_p_correlation.data["pval_corr"][i] > pval_spinner.value) continue; const rho = cds_p_correlation.data["rho"][i]; if ((rho >= neg_slider.value[0] && rho <= neg_slider.value[1]) || (rho >= pos_slider.value[0] && rho <= pos_slider.value[1])) @@ -669,7 +667,6 @@ def link_correlation_widgets(ele, cds_p_correlation): ele["correlation"]["wid"]["pos_slider"].js_on_change('value', filter_callback) ele["correlation"]["wid"]["neg_slider"].js_on_change('value', filter_callback) - ele["correlation"]["wid"]["pval_spinner"].js_on_change('value', filter_callback) ele["correlation"]["wid"]["rank_select"].js_on_change('value', rank_select_callback) diff --git a/grimer/cds.py b/grimer/cds.py index 9b3a792..75ddd60 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -4,14 +4,11 @@ from math import pi #Internal -from grimer.utils import print_df, transform_table, print_log, fdrcorrection_bh +from grimer.utils import print_df, transform_table, print_log, pairwise_rho #Bokeh from 
bokeh.models import ColumnDataSource -# Scipy -from scipy import stats - def generate_dict_taxname(tax, taxids): id_name = {} @@ -396,14 +393,13 @@ def generate_dict_refs(table, references): return d_refs -def generate_cds_correlation(table, top_obs_corr): +def generate_cds_correlation(table, top_obs_corr, replace_zero_value): # index (repeated taxids) # other taxid # rank # rho - # pval - df_corr = pd.DataFrame(columns=["taxid", "rank", "rho", "pval"]) + df_corr = pd.DataFrame(columns=["taxid", "rank", "rho"]) for rank in table.ranks(): if top_obs_corr: top_taxids = table.get_top(rank, top_obs_corr) @@ -412,24 +408,15 @@ def generate_cds_correlation(table, top_obs_corr): top_taxids = sorted(table.observations(rank)) matrix = table.data[rank] - matrix.to_csv(rank + "_top.tsv", sep="\t", header=True, index=True) - # No correlation with just one observation if len(matrix.columns) >= 2: - rho, pval = stats.spearmanr(matrix) - - #from grimer.prop import get_prop_matrix, rho - #from skbio.stats.composition import clr - - #rho = get_prop_matrix(transform_table(matrix, 0, "", 0.000001).values, rho, clr) - #print(rho) + rho = pairwise_rho(transform_table(matrix, 0, "clr", replace_zero_value).values) if len(matrix.columns) == 2: # If there are only 2 observations, return in a float # re-format in a matrix shape rho = np.array([[np.nan, np.nan], [rho, np.nan]]) - pval = np.array([[np.nan, np.nan], [pval, np.nan]]) else: # fill upper triangular matrix (mirrored values) with nan to be ignored by pandas # to save half of the space @@ -439,18 +426,10 @@ def generate_cds_correlation(table, top_obs_corr): stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True) stacked_rank_df.rename(columns={0: "rho"}, inplace=True) stacked_rank_df["rank"] = rank - #stack pval - stacked_rank_df["pval"] = np.ravel(pval) # Drop NA for rho (missing values and upper triangular matrix) stacked_rank_df.dropna(subset=['rho'], inplace=True) - # Calculate corrected pvals - stacked_rank_df["pval_corr"] = fdrcorrection_bh(stacked_rank_df["pval"].to_list()) - - # Filter by p-value - #stacked_rank_df = stacked_rank_df[stacked_rank_df["pval_corr"] <= pval_cutoff] - df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) print_df(df_corr, "df_corr -> cds_p_correlation") diff --git a/grimer/composition.py b/grimer/composition.py deleted file mode 100644 index 95a466b..0000000 --- a/grimer/composition.py +++ /dev/null @@ -1,1639 +0,0 @@ -r""" -Composition Statistics (:mod:`skbio.stats.composition`) -======================================================= - -.. currentmodule:: skbio.stats.composition - -This module provides functions for compositional data analysis. - -Many 'omics datasets are inherently compositional - meaning that they -are best interpreted as proportions or percentages rather than -absolute counts. - -Formally, :math:`x` is a composition if :math:`\sum_{i=0}^D x_{i} = c` -and :math:`x_{i} > 0`, :math:`1 \leq i \leq D` and :math:`c` is a real -valued constant and there are :math:`D` components for each -composition. In this module :math:`c=1`. Compositional data can be -analyzed using Aitchison geometry. [1]_ - -However, in this framework, standard real Euclidean operations such as -addition and multiplication no longer apply. Only operations such as -perturbation and power can be used to manipulate this data. - -This module allows two styles of manipulation of compositional data. -Compositional data can be analyzed using perturbation and power -operations, which can be useful for simulation studies. 
The -alternative strategy is to transform compositional data into the real -space. Right now, the centre log ratio transform (clr) and -the isometric log ratio transform (ilr) [2]_ can be used to accomplish -this. This transform can be useful for performing standard statistical -tools such as parametric hypothesis testing, regressions and more. - -The major caveat of using this framework is dealing with zeros. In -the Aitchison geometry, only compositions with nonzero components can -be considered. The multiplicative replacement technique [3]_ can be -used to substitute these zeros with small pseudocounts without -introducing major distortions to the data. - -Functions ---------- - -.. autosummary:: - :toctree: - - closure - multiplicative_replacement - perturb - perturb_inv - power - inner - clr - clr_inv - ilr - ilr_inv - alr - alr_inv - centralize - ancom - sbp_basis - -References ----------- -.. [1] V. Pawlowsky-Glahn, J. J. Egozcue, R. Tolosana-Delgado (2015), - Modeling and Analysis of Compositional Data, Wiley, Chichester, UK - -.. [2] J. J. Egozcue., "Isometric Logratio Transformations for - Compositional Data Analysis" Mathematical Geology, 35.3 (2003) - -.. [3] J. A. Martin-Fernandez, "Dealing With Zeros and Missing Values in - Compositional Data Sets Using Nonparametric Imputation", - Mathematical Geology, 35.3 (2003) - - -Examples --------- - ->>> import numpy as np - -Consider a very simple environment with only 3 species. The species -in the environment are equally distributed and their proportions are -equivalent: - ->>> otus = np.array([1./3, 1./3., 1./3]) - -Suppose that an antibiotic kills off half of the population for the -first two species, but doesn't harm the third species. Then the -perturbation vector would be as follows - ->>> antibiotic = np.array([1./2, 1./2, 1]) - -And the resulting perturbation would be - ->>> perturb(otus, antibiotic) -array([ 0.25, 0.25, 0.5 ]) - -""" - -# ---------------------------------------------------------------------------- -# Copyright (c) 2013--, scikit-bio development team. -# -# Distributed under the terms of the Modified BSD License. -# -# The full license is in the file COPYING.txt, distributed with this software. -# ---------------------------------------------------------------------------- -import numpy as np -import pandas as pd -import scipy.stats -import skbio.util -from skbio.util._decorator import experimental -from skbio.stats.distance import DistanceMatrix - - -@experimental(as_of="0.4.0") -def closure(mat): - """ - Performs closure to ensure that all elements add up to 1. - - Parameters - ---------- - mat : array_like - a matrix of proportions where - rows = compositions - columns = components - - Returns - ------- - array_like, np.float64 - A matrix of proportions where all of the values - are nonzero and each composition (row) adds up to 1 - - Raises - ------ - ValueError - Raises an error if any values are negative. - ValueError - Raises an error if the matrix has more than 2 dimension. - ValueError - Raises an error if there is a row that has all zeros. 
- - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import closure - >>> X = np.array([[2, 2, 6], [4, 4, 2]]) - >>> closure(X) - array([[ 0.2, 0.2, 0.6], - [ 0.4, 0.4, 0.2]]) - - """ - mat = np.atleast_2d(mat) - if np.any(mat < 0): - raise ValueError("Cannot have negative proportions") - if mat.ndim > 2: - raise ValueError("Input matrix can only have two dimensions or less") - if np.all(mat == 0, axis=1).sum() > 0: - raise ValueError("Input matrix cannot have rows with all zeros") - mat = mat / mat.sum(axis=1, keepdims=True) - return mat.squeeze() - - -@experimental(as_of="0.4.0") -def multiplicative_replacement(mat, delta=None): - r"""Replace all zeros with small non-zero values - - It uses the multiplicative replacement strategy [1]_ , - replacing zeros with a small positive :math:`\delta` - and ensuring that the compositions still add up to 1. - - - Parameters - ---------- - mat: array_like - a matrix of proportions where - rows = compositions and - columns = components - delta: float, optional - a small number to be used to replace zeros - If delta is not specified, then the default delta is - :math:`\delta = \frac{1}{N^2}` where :math:`N` - is the number of components - - Returns - ------- - numpy.ndarray, np.float64 - A matrix of proportions where all of the values - are nonzero and each composition (row) adds up to 1 - - Raises - ------ - ValueError - Raises an error if negative proportions are created due to a large - `delta`. - - Notes - ----- - This method will result in negative proportions if a large delta is chosen. - - References - ---------- - .. [1] J. A. Martin-Fernandez. "Dealing With Zeros and Missing Values in - Compositional Data Sets Using Nonparametric Imputation" - - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import multiplicative_replacement - >>> X = np.array([[.2,.4,.4, 0],[0,.5,.5,0]]) - >>> multiplicative_replacement(X) - array([[ 0.1875, 0.375 , 0.375 , 0.0625], - [ 0.0625, 0.4375, 0.4375, 0.0625]]) - - """ - mat = closure(mat) - z_mat = (mat == 0) - - num_feats = mat.shape[-1] - tot = z_mat.sum(axis=-1, keepdims=True) - - if delta is None: - delta = (1. / num_feats)**2 - - zcnts = 1 - tot * delta - if np.any(zcnts) < 0: - raise ValueError('The multiplicative replacement created negative ' - 'proportions. Consider using a smaller `delta`.') - mat = np.where(z_mat, delta, zcnts * mat) - return mat.squeeze() - - -@experimental(as_of="0.4.0") -def perturb(x, y): - r""" - Performs the perturbation operation. - - This operation is defined as - - .. math:: - x \oplus y = C[x_1 y_1, \ldots, x_D y_D] - - :math:`C[x]` is the closure operation defined as - - .. math:: - C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, - \frac{x_D}{\sum_{i=1}^{D} x_i} \right] - - for some :math:`D` dimensional real vector :math:`x` and - :math:`D` is the number of components for every composition. 
- - Parameters - ---------- - x : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - y : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - - Returns - ------- - numpy.ndarray, np.float64 - A matrix of proportions where all of the values - are nonzero and each composition (row) adds up to 1 - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import perturb - >>> x = np.array([.1,.3,.4, .2]) - >>> y = np.array([1./6,1./6,1./3,1./3]) - >>> perturb(x,y) - array([ 0.0625, 0.1875, 0.5 , 0.25 ]) - - """ - x, y = closure(x), closure(y) - return closure(x * y) - - -@experimental(as_of="0.4.0") -def perturb_inv(x, y): - r""" - Performs the inverse perturbation operation. - - This operation is defined as - - .. math:: - x \ominus y = C[x_1 y_1^{-1}, \ldots, x_D y_D^{-1}] - - :math:`C[x]` is the closure operation defined as - - .. math:: - C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, - \frac{x_D}{\sum_{i=1}^{D} x_i} \right] - - - for some :math:`D` dimensional real vector :math:`x` and - :math:`D` is the number of components for every composition. - - Parameters - ---------- - x : array_like - a matrix of proportions where - rows = compositions and - columns = components - y : array_like - a matrix of proportions where - rows = compositions and - columns = components - - Returns - ------- - numpy.ndarray, np.float64 - A matrix of proportions where all of the values - are nonzero and each composition (row) adds up to 1 - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import perturb_inv - >>> x = np.array([.1,.3,.4, .2]) - >>> y = np.array([1./6,1./6,1./3,1./3]) - >>> perturb_inv(x,y) - array([ 0.14285714, 0.42857143, 0.28571429, 0.14285714]) - """ - x, y = closure(x), closure(y) - return closure(x / y) - - -@experimental(as_of="0.4.0") -def power(x, a): - r""" - Performs the power operation. - - This operation is defined as follows - - .. math:: - `x \odot a = C[x_1^a, \ldots, x_D^a] - - :math:`C[x]` is the closure operation defined as - - .. math:: - C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, - \frac{x_D}{\sum_{i=1}^{D} x_i} \right] - - for some :math:`D` dimensional real vector :math:`x` and - :math:`D` is the number of components for every composition. - - Parameters - ---------- - x : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - a : float - a scalar float - - Returns - ------- - numpy.ndarray, np.float64 - A matrix of proportions where all of the values - are nonzero and each composition (row) adds up to 1 - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import power - >>> x = np.array([.1,.3,.4, .2]) - >>> power(x, .1) - array([ 0.23059566, 0.25737316, 0.26488486, 0.24714631]) - - """ - x = closure(x) - return closure(x**a).squeeze() - - -@experimental(as_of="0.4.0") -def inner(x, y): - r""" - Calculates the Aitchson inner product. - - This inner product is defined as follows - - .. 
math:: - \langle x, y \rangle_a = - \frac{1}{2D} \sum\limits_{i=1}^{D} \sum\limits_{j=1}^{D} - \ln\left(\frac{x_i}{x_j}\right) \ln\left(\frac{y_i}{y_j}\right) - - Parameters - ---------- - x : array_like - a matrix of proportions where - rows = compositions and - columns = components - y : array_like - a matrix of proportions where - rows = compositions and - columns = components - - Returns - ------- - numpy.ndarray - inner product result - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import inner - >>> x = np.array([.1, .3, .4, .2]) - >>> y = np.array([.2, .4, .2, .2]) - >>> inner(x, y) # doctest: +ELLIPSIS - 0.2107852473... - """ - x = closure(x) - y = closure(y) - a, b = clr(x), clr(y) - return a.dot(b.T) - - -@experimental(as_of="0.4.0") -def clr(mat): - r""" - Performs centre log ratio transformation. - - This function transforms compositions from Aitchison geometry to - the real space. The :math:`clr` transform is both an isometry and an - isomorphism defined on the following spaces - - :math:`clr: S^D \rightarrow U` - - where :math:`U= - \{x :\sum\limits_{i=1}^D x = 0 \; \forall x \in \mathbb{R}^D\}` - - It is defined for a composition :math:`x` as follows: - - .. math:: - clr(x) = \ln\left[\frac{x_1}{g_m(x)}, \ldots, \frac{x_D}{g_m(x)}\right] - - where :math:`g_m(x) = (\prod\limits_{i=1}^{D} x_i)^{1/D}` is the geometric - mean of :math:`x`. - - Parameters - ---------- - mat : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - - Returns - ------- - numpy.ndarray - clr transformed matrix - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import clr - >>> x = np.array([.1, .3, .4, .2]) - >>> clr(x) - array([-0.79451346, 0.30409883, 0.5917809 , -0.10136628]) - - """ - mat = closure(mat) - lmat = np.log(mat) - gm = lmat.mean(axis=-1, keepdims=True) - return (lmat - gm).squeeze() - - -@experimental(as_of="0.4.0") -def clr_inv(mat): - r""" - Performs inverse centre log ratio transformation. - - This function transforms compositions from the real space to - Aitchison geometry. The :math:`clr^{-1}` transform is both an isometry, - and an isomorphism defined on the following spaces - - :math:`clr^{-1}: U \rightarrow S^D` - - where :math:`U= - \{x :\sum\limits_{i=1}^D x = 0 \; \forall x \in \mathbb{R}^D\}` - - This transformation is defined as follows - - .. math:: - clr^{-1}(x) = C[\exp( x_1, \ldots, x_D)] - - Parameters - ---------- - mat : array_like, float - a matrix of real values where - rows = transformed compositions and - columns = components - - Returns - ------- - numpy.ndarray - inverse clr transformed matrix - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import clr_inv - >>> x = np.array([.1, .3, .4, .2]) - >>> clr_inv(x) - array([ 0.21383822, 0.26118259, 0.28865141, 0.23632778]) - - """ - return closure(np.exp(mat)) - - -@experimental(as_of="0.4.0") -def ilr(mat, basis=None, check=True): - r""" - Performs isometric log ratio transformation. - - This function transforms compositions from Aitchison simplex to - the real space. The :math: ilr` transform is both an isometry, - and an isomorphism defined on the following spaces - - :math:`ilr: S^D \rightarrow \mathbb{R}^{D-1}` - - The ilr transformation is defined as follows - - .. math:: - ilr(x) = - [\langle x, e_1 \rangle_a, \ldots, \langle x, e_{D-1} \rangle_a] - - where :math:`[e_1,\ldots,e_{D-1}]` is an orthonormal basis in the simplex. 
- - If an orthornormal basis isn't specified, the J. J. Egozcue orthonormal - basis derived from Gram-Schmidt orthogonalization will be used by - default. - - Parameters - ---------- - mat: numpy.ndarray - a matrix of proportions where - rows = compositions and - columns = components - - basis: numpy.ndarray, float, optional - orthonormal basis for Aitchison simplex - defaults to J.J.Egozcue orthonormal basis. - - check: bool - Specifies if the basis is orthonormal. - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import ilr - >>> x = np.array([.1, .3, .4, .2]) - >>> ilr(x) - array([-0.7768362 , -0.68339802, 0.11704769]) - - Notes - ----- - If the `basis` parameter is specified, it is expected to be a basis in the - Aitchison simplex. If there are `D-1` elements specified in `mat`, then - the dimensions of the basis needs be `D-1 x D`, where rows represent - basis vectors, and the columns represent proportions. - """ - mat = closure(mat) - if basis is None: - basis = clr_inv(_gram_schmidt_basis(mat.shape[-1])) - else: - if len(basis.shape) != 2: - raise ValueError("Basis needs to be a 2D matrix, " - "not a %dD matrix." % - (len(basis.shape))) - if check: - _check_orthogonality(basis) - - return inner(mat, basis) - - -@experimental(as_of="0.4.0") -def ilr_inv(mat, basis=None, check=True): - r""" - Performs inverse isometric log ratio transform. - - This function transforms compositions from the real space to - Aitchison geometry. The :math:`ilr^{-1}` transform is both an isometry, - and an isomorphism defined on the following spaces - - :math:`ilr^{-1}: \mathbb{R}^{D-1} \rightarrow S^D` - - The inverse ilr transformation is defined as follows - - .. math:: - ilr^{-1}(x) = \bigoplus\limits_{i=1}^{D-1} x \odot e_i - - where :math:`[e_1,\ldots, e_{D-1}]` is an orthonormal basis in the simplex. - - If an orthonormal basis isn't specified, the J. J. Egozcue orthonormal - basis derived from Gram-Schmidt orthogonalization will be used by - default. - - - Parameters - ---------- - mat: numpy.ndarray, float - a matrix of transformed proportions where - rows = compositions and - columns = components - - basis: numpy.ndarray, float, optional - orthonormal basis for Aitchison simplex - defaults to J.J.Egozcue orthonormal basis - - check: bool - Specifies if the basis is orthonormal. - - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import ilr - >>> x = np.array([.1, .3, .6,]) - >>> ilr_inv(x) - array([ 0.34180297, 0.29672718, 0.22054469, 0.14092516]) - - Notes - ----- - If the `basis` parameter is specified, it is expected to be a basis in the - Aitchison simplex. If there are `D-1` elements specified in `mat`, then - the dimensions of the basis needs be `D-1 x D`, where rows represent - basis vectors, and the columns represent proportions. - """ - - if basis is None: - basis = _gram_schmidt_basis(mat.shape[-1] + 1) - else: - if len(basis.shape) != 2: - raise ValueError("Basis needs to be a 2D matrix, " - "not a %dD matrix." % - (len(basis.shape))) - if check: - _check_orthogonality(basis) - # this is necessary, since the clr function - # performs np.squeeze() - basis = np.atleast_2d(clr(basis)) - - return clr_inv(np.dot(mat, basis)) - - -@experimental(as_of="0.5.5") -def alr(mat, denominator_idx=0): - r""" - Performs additive log ratio transformation. - - This function transforms compositions from a D-part Aitchison simplex to - a non-isometric real space of D-1 dimensions. 
The argument - `denominator_col` defines the index of the column used as the common - denominator. The :math: `alr` transformed data are amenable to multivariate - analysis as long as statistics don't involve distances. - - :math:`alr: S^D \rightarrow \mathbb{R}^{D-1}` - - The alr transformation is defined as follows - - .. math:: - alr(x) = \left[ \ln \frac{x_1}{x_D}, \ldots, - \ln \frac{x_{D-1}}{x_D} \right] - - where :math:`D` is the index of the part used as common denominator. - - Parameters - ---------- - mat: numpy.ndarray - a matrix of proportions where - rows = compositions and - columns = components - - denominator_idx: int - the index of the column (2D-matrix) or position (vector) of - `mat` which should be used as the reference composition. By default - `denominator_idx=0` to specify the first column or position. - - Returns - ------- - numpy.ndarray - alr-transformed data projected in a non-isometric real space - of D-1 dimensions for a D-parts composition - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import alr - >>> x = np.array([.1, .3, .4, .2]) - >>> alr(x) - array([ 1.09861229, 1.38629436, 0.69314718]) - """ - mat = closure(mat) - if mat.ndim == 2: - mat_t = mat.T - numerator_idx = list(range(0, mat_t.shape[0])) - del numerator_idx[denominator_idx] - lr = np.log(mat_t[numerator_idx, :]/mat_t[denominator_idx, :]).T - elif mat.ndim == 1: - numerator_idx = list(range(0, mat.shape[0])) - del numerator_idx[denominator_idx] - lr = np.log(mat[numerator_idx]/mat[denominator_idx]) - else: - raise ValueError("mat must be either 1D or 2D") - return lr - - -@experimental(as_of="0.5.5") -def alr_inv(mat, denominator_idx=0): - r""" - Performs inverse additive log ratio transform. - - This function transforms compositions from the non-isometric real space of - alrs to Aitchison geometry. - - :math:`alr^{-1}: \mathbb{R}^{D-1} \rightarrow S^D` - - The inverse alr transformation is defined as follows - - .. math:: - alr^{-1}(x) = C[exp([y_1, y_2, ..., y_{D-1}, 0])] - - where :math:`C[x]` is the closure operation defined as - - .. math:: - C[x] = \left[\frac{x_1}{\sum_{i=1}^{D} x_i},\ldots, - \frac{x_D}{\sum_{i=1}^{D} x_i} \right] - - for some :math:`D` dimensional real vector :math:`x` and - :math:`D` is the number of components for every composition. - - Parameters - ---------- - mat: numpy.ndarray - a matrix of alr-transformed data - denominator_idx: int - the index of the column (2D-composition) or position (1D-composition) of - the output where the common denominator should be placed. By default - `denominator_idx=0` to specify the first column or position. - - Returns - ------- - numpy.ndarray - Inverse alr transformed matrix or vector where rows sum to 1. 
- - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import alr, alr_inv - >>> x = np.array([.1, .3, .4, .2]) - >>> alr_inv(alr(x)) - array([ 0.1, 0.3, 0.4, 0.2]) - """ - mat = np.array(mat) - if mat.ndim == 2: - mat_idx = np.insert(mat, denominator_idx, - np.repeat(0, mat.shape[0]), axis=1) - comp = np.zeros(mat_idx.shape) - comp[:, denominator_idx] = 1 / (np.exp(mat).sum(axis=1) + 1) - numerator_idx = list(range(0, comp.shape[1])) - del numerator_idx[denominator_idx] - for i in numerator_idx: - comp[:, i] = comp[:, denominator_idx] * np.exp(mat_idx[:, i]) - elif mat.ndim == 1: - mat_idx = np.insert(mat, denominator_idx, 0, axis=0) - comp = np.zeros(mat_idx.shape) - comp[denominator_idx] = 1 / (np.exp(mat).sum(axis=0) + 1) - numerator_idx = list(range(0, comp.shape[0])) - del numerator_idx[denominator_idx] - for i in numerator_idx: - comp[i] = comp[denominator_idx] * np.exp(mat_idx[i]) - else: - raise ValueError("mat must be either 1D or 2D") - return comp - - -@experimental(as_of="0.4.0") -def centralize(mat): - r"""Center data around its geometric average. - - Parameters - ---------- - mat : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - - Returns - ------- - numpy.ndarray - centered composition matrix - - Examples - -------- - >>> import numpy as np - >>> from skbio.stats.composition import centralize - >>> X = np.array([[.1,.3,.4, .2],[.2,.2,.2,.4]]) - >>> centralize(X) - array([[ 0.17445763, 0.30216948, 0.34891526, 0.17445763], - [ 0.32495488, 0.18761279, 0.16247744, 0.32495488]]) - - """ - mat = closure(mat) - cen = scipy.stats.gmean(mat, axis=0) - return perturb_inv(mat, cen) - - - -@experimental(as_of="0.5.7") -def _vlr(x:np.array, y:np.array, ddof:int): - r""" - Calculates variance log ratio - - Parameters - ---------- - x : array_like, float - a 1-dimensional vector of proportions - y : array_like, float - a 1-dimensional vector of proportions - - ddof: int - degrees of freedom - - - Returns - ------- - float - variance log ratio value - - - References - ---------- - .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) - Proportionality: A Valid Alternative to Correlation for Relative Data. - PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 - .. [2] Erb, I., Notredame, C. - How should we measure proportionality on relative gene expression data?. - Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 - """ - - # Log transformation - x = np.log(x) - y = np.log(y) - - #Variance log ratio - return np.var(x - y, ddof=ddof) - - -@experimental(as_of="0.5.7") -def _robust_vlr(x:np.array, y:np.array, ddof:int): - r""" - Calculates variance log ratio while masking zeros - - Parameters - ---------- - x : array_like, float - a 1-dimensional vector of proportions - y : array_like, float - a 1-dimensional vector of proportions - - ddof: int - degrees of freedom - - Returns - ------- - float - variance log ratio value - - - References - ---------- - .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) - Proportionality: A Valid Alternative to Correlation for Relative Data. - PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 - .. [2] Erb, I., Notredame, C. - How should we measure proportionality on relative gene expression data?. - Theory Biosci. 135, 21–36 (2016). 
https://doi.org/10.1007/s12064-015-0220-8 - """ - - # Mask zeros - x = np.ma.masked_array(x, mask=x == 0) - y = np.ma.masked_array(y, mask=y == 0) - - # Log transformation - x = np.ma.log(x) - y = np.ma.log(y) - - #Variance log ratio - return np.ma.var(x - y, ddof=ddof) - -@experimental(as_of="0.5.7") -def vlr(x:np.array, y:np.array, ddof:int=1, robust:bool=False): - r""" - Calculates variance log ratio - - Parameters - ---------- - x : array_like, float - a 1-dimensional vector of proportions - y : array_like, float - a 1-dimensional vector of proportions - - ddof: int - degrees of freedom - - robust: bool - mask zeros at the cost of performance - - Returns - ------- - float - variance log ratio value - - Examples - -------- - No zeros - >>> x = [1,2,3] - >>> y = [5,8,13] - >>> %timeit vlr(x,y) - >>> 0.01277962183258352 - - Zeros without robust - >>> x = [1,2,3,0] - >>> y = [5,8,13,21] - >>> vlr(x,y) - >>> nan - - Zeros with robust - >>> x = [1,2,3,0] - >>> y = [5,8,13,21] - >>> vlr(x,y, robust=True) - >>> 0.01277962183258352 - - References - ---------- - .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) - Proportionality: A Valid Alternative to Correlation for Relative Data. - PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 - .. [2] Erb, I., Notredame, C. - How should we measure proportionality on relative gene expression data?. - Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 - """ - # Convert array_like to numpy array - x = np.asarray(x) - y = np.asarray(y) - - # Set up input and parameters - kwargs = { - "x":x, - "y":y, - "ddof":ddof, - } - - # Run backend function - if robust: - return _robust_vlr(**kwargs) - else: - return _vlr(**kwargs) - - -@experimental(as_of="0.5.7") -def _pairwise_vlr(mat:np.array, ddof:int): - r""" - Performs pairwise variance log ratio transformation - - Parameters - ---------- - mat : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - - ids: array_like, str - component names - - ddof: int - degrees of freedom - - Returns - ------- - skbio.DistanceMatrix - distance matrix of variance log ratio values - - - References - ---------- - .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) - Proportionality: A Valid Alternative to Correlation for Relative Data. - PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 - .. [2] Erb, I., Notredame, C. - How should we measure proportionality on relative gene expression data?. - Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 - """ - - # Log Transform - X_log = np.log(mat) - - # Variance Log Ratio - covariance = np.cov(X_log.T, ddof=ddof) - diagonal = np.diagonal(covariance) - vlr_data = -2*covariance + diagonal[:,np.newaxis] + diagonal - return vlr_data - -@experimental(as_of="0.5.7") -def _robust_pairwise_vlr(mat:np.array, ddof:int): - r""" - Performs pairwise variance log ratio transformation while masking zeros - - Parameters - ---------- - mat : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - - ids: array_like, str - component names - - ddof: int - degrees of freedom - - Returns - ------- - skbio.DistanceMatrix - distance matrix of variance log ratio values - - - References - ---------- - .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) - Proportionality: A Valid Alternative to Correlation for Relative Data. 
- PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 - .. [2] Erb, I., Notredame, C. - How should we measure proportionality on relative gene expression data?. - Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 - """ - # Mask zeros - X = np.ma.masked_array(mat, mask=mat == 0) - - # Log Transform - X_log = np.ma.log(X) - - # Variance Log Ratio - covariance = np.ma.cov(X_log.T, ddof=ddof) - diagonal = np.ma.diagonal(covariance) - vlr_data = -2*covariance + diagonal[:,np.newaxis] + diagonal - return vlr_data - -@experimental(as_of="0.5.7") -def pairwise_vlr(mat, ids=None, ddof=1, robust=False) -> DistanceMatrix: - r""" - Performs pairwise variance log ratio transformation - - Parameters - ---------- - mat : array_like, float - a matrix of proportions where - rows = compositions and - columns = components - - ids: array_like, str - component names - - ddof: int - degrees of freedom - - robust: bool - mask zeros at the cost of performance - - Returns - ------- - skbio.DistanceMatrix - distance matrix of variance log ratio values - - Examples - -------- - >>> mat = np.asarray([ - [1,2,3], - [5,8,13], - [21,34,55], - ]) - >>> dism = pairwise_vlr(mat) - >>> dism.redundant_form() - array([[0. , 0.01576411, 0.00649553], - [0.01576411, 0. , 0.00202147], - [0.00649553, 0.00202147, 0. ]]) - - - References - ---------- - .. [1] V. Lovell D, Pawlowsky-Glahn V, Egozcue JJ, Marguerat S, Bähler J (2015) - Proportionality: A Valid Alternative to Correlation for Relative Data. - PLoS Comput Biol 11(3): e1004075. https://doi.org/10.1371/journal.pcbi.1004075 - .. [2] Erb, I., Notredame, C. - How should we measure proportionality on relative gene expression data?. - Theory Biosci. 135, 21–36 (2016). https://doi.org/10.1007/s12064-015-0220-8 - """ - - # Mask zeros - mat = mat.astype(np.float64) - mat = closure(mat) - # Set up input and parameters - kwargs = { - "mat":mat, - "ddof":ddof, - } - - # Variance Log Ratio - if robust: - vlr_data = _robust_pairwise_vlr(**kwargs) - else: - vlr_data = _pairwise_vlr(**kwargs) - - # Force symmetry - vlr_data = (vlr_data + vlr_data.T)/2 - - # Create distance matrix - return DistanceMatrix(vlr_data, ids=ids) - - -@experimental(as_of="0.4.1") -def ancom(table, grouping, - alpha=0.05, - tau=0.02, - theta=0.1, - multiple_comparisons_correction='holm-bonferroni', - significance_test=None, - percentiles=(0.0, 25.0, 50.0, 75.0, 100.0)): - r""" Performs a differential abundance test using ANCOM. - - This is done by calculating pairwise log ratios between all features - and performing a significance test to determine if there is a significant - difference in feature ratios with respect to the variable of interest. - - In an experiment with only two treatments, this tests the following - hypothesis for feature :math:`i` - - .. math:: - - H_{0i}: \mathbb{E}[\ln(u_i^{(1)})] = \mathbb{E}[\ln(u_i^{(2)})] - - where :math:`u_i^{(1)}` is the mean abundance for feature :math:`i` in the - first group and :math:`u_i^{(2)}` is the mean abundance for feature - :math:`i` in the second group. - - Parameters - ---------- - table : pd.DataFrame - A 2D matrix of strictly positive values (i.e. counts or proportions) - where the rows correspond to samples and the columns correspond to - features. - grouping : pd.Series - Vector indicating the assignment of samples to groups. For example, - these could be strings or integers denoting which group a sample - belongs to. It must be the same length as the samples in `table`. 
- The index must be the same on `table` and `grouping` but need not be - in the same order. - alpha : float, optional - Significance level for each of the statistical tests. - This can can be anywhere between 0 and 1 exclusive. - tau : float, optional - A constant used to determine an appropriate cutoff. - A value close to zero indicates a conservative cutoff. - This can can be anywhere between 0 and 1 exclusive. - theta : float, optional - Lower bound for the proportion for the W-statistic. - If all W-statistics are lower than theta, then no features - will be detected to be differentially significant. - This can can be anywhere between 0 and 1 exclusive. - multiple_comparisons_correction : {None, 'holm-bonferroni'}, optional - The multiple comparison correction procedure to run. If None, - then no multiple comparison correction procedure will be run. - If 'holm-boniferroni' is specified, then the Holm-Boniferroni - procedure [1]_ will be run. - significance_test : function, optional - A statistical significance function to test for significance between - classes. This function must be able to accept at least two 1D - array_like arguments of floats and returns a test statistic and a - p-value. By default ``scipy.stats.f_oneway`` is used. - percentiles : iterable of floats, optional - Percentile abundances to return for each feature in each group. By - default, will return the minimum, 25th percentile, median, 75th - percentile, and maximum abundances for each feature in each group. - - Returns - ------- - pd.DataFrame - A table of features, their W-statistics and whether the null hypothesis - is rejected. - - `"W"` is the W-statistic, or number of features that a single feature - is tested to be significantly different against. - - `"Reject null hypothesis"` indicates if feature is differentially - abundant across groups (`True`) or not (`False`). - - pd.DataFrame - A table of features and their percentile abundances in each group. If - ``percentiles`` is empty, this will be an empty ``pd.DataFrame``. The - rows in this object will be features, and the columns will be a - multi-index where the first index is the percentile, and the second - index is the group. - - See Also - -------- - multiplicative_replacement - scipy.stats.ttest_ind - scipy.stats.f_oneway - scipy.stats.wilcoxon - scipy.stats.kruskal - - Notes - ----- - The developers of this method recommend the following significance tests - ([2]_, Supplementary File 1, top of page 11): if there are 2 groups, use - the standard parametric t-test (``scipy.stats.ttest_ind``) or - non-parametric Wilcoxon rank sum test (``scipy.stats.wilcoxon``). - If there are more than 2 groups, use parametric one-way ANOVA - (``scipy.stats.f_oneway``) or nonparametric Kruskal-Wallis - (``scipy.stats.kruskal``). Because one-way ANOVA is equivalent - to the standard t-test when the number of groups is two, we default to - ``scipy.stats.f_oneway`` here, which can be used when there are two or - more groups. Users should refer to the documentation of these tests in - SciPy to understand the assumptions made by each test. - - This method cannot handle any zero counts as input, since the logarithm - of zero cannot be computed. While this is an unsolved problem, many - studies, including [2]_, have shown promising results by adding - pseudocounts to all values in the matrix. In [2]_, a pseudocount of 0.001 - was used, though the authors note that a pseudocount of 1.0 may also be - useful. 
Zero counts can also be addressed using the - ``multiplicative_replacement`` method. - - References - ---------- - .. [1] Holm, S. "A simple sequentially rejective multiple test procedure". - Scandinavian Journal of Statistics (1979), 6. - .. [2] Mandal et al. "Analysis of composition of microbiomes: a novel - method for studying microbial composition", Microbial Ecology in Health - & Disease, (2015), 26. - - Examples - -------- - First import all of the necessary modules: - - >>> from skbio.stats.composition import ancom - >>> import pandas as pd - - Now let's load in a DataFrame with 6 samples and 7 features (e.g., - these may be bacterial OTUs): - - >>> table = pd.DataFrame([[12, 11, 10, 10, 10, 10, 10], - ... [9, 11, 12, 10, 10, 10, 10], - ... [1, 11, 10, 11, 10, 5, 9], - ... [22, 21, 9, 10, 10, 10, 10], - ... [20, 22, 10, 10, 13, 10, 10], - ... [23, 21, 14, 10, 10, 10, 10]], - ... index=['s1', 's2', 's3', 's4', 's5', 's6'], - ... columns=['b1', 'b2', 'b3', 'b4', 'b5', 'b6', - ... 'b7']) - - Then create a grouping vector. In this example, there is a treatment group - and a placebo group. - - >>> grouping = pd.Series(['treatment', 'treatment', 'treatment', - ... 'placebo', 'placebo', 'placebo'], - ... index=['s1', 's2', 's3', 's4', 's5', 's6']) - - Now run ``ancom`` to determine if there are any features that are - significantly different in abundance between the treatment and the placebo - groups. The first DataFrame that is returned contains the ANCOM test - results, and the second contains the percentile abundance data for each - feature in each group. - - >>> ancom_df, percentile_df = ancom(table, grouping) - >>> ancom_df['W'] - b1 0 - b2 4 - b3 0 - b4 1 - b5 1 - b6 0 - b7 1 - Name: W, dtype: int64 - - The W-statistic is the number of features that a single feature is tested - to be significantly different against. In this scenario, `b2` was detected - to have significantly different abundances compared to four of the other - features. To summarize the results from the W-statistic, let's take a look - at the results from the hypothesis test. The `Reject null hypothesis` - column in the table indicates whether the null hypothesis was rejected, - and that a feature was therefore observed to be differentially abundant - across the groups. - - >>> ancom_df['Reject null hypothesis'] - b1 False - b2 True - b3 False - b4 False - b5 False - b6 False - b7 False - Name: Reject null hypothesis, dtype: bool - - From this we can conclude that only `b2` was significantly different in - abundance between the treatment and the placebo. We still don't know, for - example, in which group `b2` was more abundant. We therefore may next be - interested in comparing the abundance of `b2` across the two groups. - We can do that using the second DataFrame that was returned. 
Here we - compare the median (50th percentile) abundance of `b2` in the treatment and - placebo groups: - - >>> percentile_df[50.0].loc['b2'] - Group - placebo 21.0 - treatment 11.0 - Name: b2, dtype: float64 - - We can also look at a full five-number summary for ``b2`` in the treatment - and placebo groups: - - >>> percentile_df.loc['b2'] # doctest: +NORMALIZE_WHITESPACE - Percentile Group - 0.0 placebo 21.0 - 25.0 placebo 21.0 - 50.0 placebo 21.0 - 75.0 placebo 21.5 - 100.0 placebo 22.0 - 0.0 treatment 11.0 - 25.0 treatment 11.0 - 50.0 treatment 11.0 - 75.0 treatment 11.0 - 100.0 treatment 11.0 - Name: b2, dtype: float64 - - Taken together, these data tell us that `b2` is present in significantly - higher abundance in the placebo group samples than in the treatment group - samples. - - """ - if not isinstance(table, pd.DataFrame): - raise TypeError('`table` must be a `pd.DataFrame`, ' - 'not %r.' % type(table).__name__) - if not isinstance(grouping, pd.Series): - raise TypeError('`grouping` must be a `pd.Series`,' - ' not %r.' % type(grouping).__name__) - - if np.any(table <= 0): - raise ValueError('Cannot handle zeros or negative values in `table`. ' - 'Use pseudocounts or ``multiplicative_replacement``.' - ) - - if not 0 < alpha < 1: - raise ValueError('`alpha`=%f is not within 0 and 1.' % alpha) - - if not 0 < tau < 1: - raise ValueError('`tau`=%f is not within 0 and 1.' % tau) - - if not 0 < theta < 1: - raise ValueError('`theta`=%f is not within 0 and 1.' % theta) - - if multiple_comparisons_correction is not None: - if multiple_comparisons_correction != 'holm-bonferroni': - raise ValueError('%r is not an available option for ' - '`multiple_comparisons_correction`.' - % multiple_comparisons_correction) - - if (grouping.isnull()).any(): - raise ValueError('Cannot handle missing values in `grouping`.') - - if (table.isnull()).any().any(): - raise ValueError('Cannot handle missing values in `table`.') - - percentiles = list(percentiles) - for percentile in percentiles: - if not 0.0 <= percentile <= 100.0: - raise ValueError('Percentiles must be in the range [0, 100], %r ' - 'was provided.' % percentile) - - duplicates = skbio.util.find_duplicates(percentiles) - if duplicates: - formatted_duplicates = ', '.join(repr(e) for e in duplicates) - raise ValueError('Percentile values must be unique. The following' - ' value(s) were duplicated: %s.' % - formatted_duplicates) - - groups = np.unique(grouping) - num_groups = len(groups) - - if num_groups == len(grouping): - raise ValueError( - "All values in `grouping` are unique. This method cannot " - "operate on a grouping vector with only unique values (e.g., " - "there are no 'within' variance because each group of samples " - "contains only a single sample).") - - if num_groups == 1: - raise ValueError( - "All values the `grouping` are the same. 
This method cannot " - "operate on a grouping vector with only a single group of samples" - "(e.g., there are no 'between' variance because there is only a " - "single group).") - - if significance_test is None: - significance_test = scipy.stats.f_oneway - - table_index_len = len(table.index) - grouping_index_len = len(grouping.index) - mat, cats = table.align(grouping, axis=0, join='inner') - if (len(mat) != table_index_len or len(cats) != grouping_index_len): - raise ValueError('`table` index and `grouping` ' - 'index must be consistent.') - - n_feat = mat.shape[1] - - _logratio_mat = _log_compare(mat.values, cats.values, significance_test) - logratio_mat = _logratio_mat + _logratio_mat.T - - # Multiple comparisons - if multiple_comparisons_correction == 'holm-bonferroni': - logratio_mat = np.apply_along_axis(_holm_bonferroni, - 1, logratio_mat) - np.fill_diagonal(logratio_mat, 1) - W = (logratio_mat < alpha).sum(axis=1) - c_start = W.max() / n_feat - if c_start < theta: - reject = np.zeros_like(W, dtype=bool) - else: - # Select appropriate cutoff - cutoff = c_start - np.linspace(0.05, 0.25, 5) - prop_cut = np.array([(W > n_feat*cut).mean() for cut in cutoff]) - dels = np.abs(prop_cut - np.roll(prop_cut, -1)) - dels[-1] = 0 - - if (dels[0] < tau) and (dels[1] < tau) and (dels[2] < tau): - nu = cutoff[1] - elif (dels[0] >= tau) and (dels[1] < tau) and (dels[2] < tau): - nu = cutoff[2] - elif (dels[1] >= tau) and (dels[2] < tau) and (dels[3] < tau): - nu = cutoff[3] - else: - nu = cutoff[4] - reject = (W >= nu*n_feat) - - feat_ids = mat.columns - ancom_df = pd.DataFrame( - {'W': pd.Series(W, index=feat_ids), - 'Reject null hypothesis': pd.Series(reject, index=feat_ids)}) - - if len(percentiles) == 0: - return ancom_df, pd.DataFrame() - else: - data = [] - columns = [] - for group in groups: - feat_dists = mat[cats == group] - for percentile in percentiles: - columns.append((percentile, group)) - data.append(np.percentile(feat_dists, percentile, axis=0)) - columns = pd.MultiIndex.from_tuples(columns, - names=['Percentile', 'Group']) - percentile_df = pd.DataFrame( - np.asarray(data).T, columns=columns, index=feat_ids) - return ancom_df, percentile_df - - -def _holm_bonferroni(p): - """ Performs Holm-Bonferroni correction for pvalues - to account for multiple comparisons - - Parameters - --------- - p: numpy.array - array of pvalues - - Returns - ------- - numpy.array - corrected pvalues - """ - K = len(p) - sort_index = -np.ones(K, dtype=np.int64) - sorted_p = np.sort(p) - sorted_p_adj = sorted_p*(K-np.arange(K)) - for j in range(K): - idx = (p == sorted_p[j]) & (sort_index < 0) - num_ties = len(sort_index[idx]) - sort_index[idx] = np.arange(j, (j+num_ties), dtype=np.int64) - - sorted_holm_p = [min([max(sorted_p_adj[:k]), 1]) - for k in range(1, K+1)] - holm_p = [sorted_holm_p[sort_index[k]] for k in range(K)] - return holm_p - - -def _log_compare(mat, cats, - significance_test=scipy.stats.ttest_ind): - """ Calculates pairwise log ratios between all features and performs a - significiance test (i.e. t-test) to determine if there is a significant - difference in feature ratios with respect to the variable of interest. - - Parameters - ---------- - mat: np.array - rows correspond to samples and columns correspond to - features (i.e. 
OTUs) - cats: np.array, float - Vector of categories - significance_test: function - statistical test to run - - Returns: - -------- - log_ratio : np.array - log ratio pvalue matrix - """ - r, c = mat.shape - log_ratio = np.zeros((c, c)) - log_mat = np.log(mat) - cs = np.unique(cats) - - def func(x): - return significance_test(*[x[cats == k] for k in cs]) - - for i in range(c-1): - ratio = (log_mat[:, i].T - log_mat[:, i+1:].T).T - m, p = np.apply_along_axis(func, - axis=0, - arr=ratio) - log_ratio[i, i+1:] = np.squeeze(np.array(p.T)) - return log_ratio - - -def _gram_schmidt_basis(n): - """ - Builds clr transformed basis derived from - gram schmidt orthogonalization - - Parameters - ---------- - n : int - Dimension of the Aitchison simplex - """ - basis = np.zeros((n, n-1)) - for j in range(n-1): - i = j + 1 - e = np.array([(1/i)]*i + [-1] + - [0]*(n-i-1))*np.sqrt(i/(i+1)) - basis[:, j] = e - return basis.T - - -@experimental(as_of="0.5.5") -def sbp_basis(sbp): - r""" - Builds an orthogonal basis from a sequential binary partition (SBP). As - explained in [1]_, the SBP is a hierarchical collection of binary - divisions of compositional parts. The child groups are divided again until - all groups contain a single part. The SBP can be encoded in a - :math:`(D - 1) \times D` matrix where, for each row, parts can be grouped - by -1 and +1 tags, and 0 for excluded parts. The `sbp_basis` method was - originally derived from function `gsi.buildilrBase()` found in the R - package `compositions` [2]_. The ith balance is computed as follows - - .. math:: - b_i = \sqrt{ \frac{r_i s_i}{r_i+s_i} } - \ln \left( \frac{g(x_{r_i})}{g(x_{s_i})} \right) - - where :math:`b_i` is the ith balance corresponding to the ith row in the - SBP, :math:`r_i` and :math:`s_i` and the number of respectively `+1` and - `-1` labels in the ith row of the SBP and where :math:`g(x) = - (\prod\limits_{i=1}^{D} x_i)^{1/D}` is the geometric mean of :math:`x`. - - Parameters - ---------- - sbp: np.array, int - A contrast matrix, also known as a sequential binary partition, where - every row represents a partition between two groups of features. A part - labelled `+1` would correspond to that feature being in the numerator - of the given row partition, a part labelled `-1` would correspond to - features being in the denominator of that given row partition, and `0` - would correspond to features excluded in the row partition. - - Returns - ------- - numpy.array - An orthonormal basis in the Aitchison simplex - - Examples - -------- - >>> import numpy as np - >>> sbp = np.array([[1, 1,-1,-1,-1], - ... [1,-1, 0, 0, 0], - ... [0, 0, 1,-1,-1], - ... [0, 0, 0, 1,-1]]) - ... - >>> sbp_basis(sbp) - array([[ 0.31209907, 0.31209907, 0.12526729, 0.12526729, 0.12526729], - [ 0.36733337, 0.08930489, 0.18112058, 0.18112058, 0.18112058], - [ 0.17882092, 0.17882092, 0.40459293, 0.11888261, 0.11888261], - [ 0.18112058, 0.18112058, 0.18112058, 0.36733337, 0.08930489]]) - - References - ---------- - .. [1] Parent, S.É., Parent, L.E., Egozcue, J.J., Rozane, D.E., - Hernandes, A., Lapointe, L., Hébert-Gentile, V., Naess, K., - Marchand, S., Lafond, J., Mattos, D., Barlow, P., Natale, W., 2013. - The plant ionome revisited by the nutrient balance concept. - Front. Plant Sci. 4, 39, http://dx.doi.org/10.3389/fpls.2013.00039. - .. [2] van den Boogaart, K. Gerald, Tolosana-Delgado, Raimon and Bren, - Matevz, 2014. `compositions`: Compositional Data Analysis. R package - version 1.40-1. https://CRAN.R-project.org/package=compositions. 
- """ - - n_pos = (sbp == 1).sum(axis=1) - n_neg = (sbp == -1).sum(axis=1) - psi = np.zeros(sbp.shape) - for i in range(0, sbp.shape[0]): - psi[i, :] = sbp[i, :] * np.sqrt((n_neg[i] / n_pos[i])**sbp[i, :] / - np.sum(np.abs(sbp[i, :]))) - return clr_inv(psi) - - -def _check_orthogonality(basis): - """ - Checks to see if basis is truly orthonormal in the - Aitchison simplex - - Parameters - ---------- - basis: numpy.ndarray - basis in the Aitchison simplex - """ - basis = np.atleast_2d(basis) - if not np.allclose(inner(basis, basis), np.identity(len(basis)), - rtol=1e-4, atol=1e-6): - raise ValueError("Aitchison basis is not orthonormal") diff --git a/grimer/grimer.py b/grimer/grimer.py index eff1d2b..33eec25 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -244,8 +244,8 @@ def main(): cds_p_annotations = generate_cds_annotations(table, references, controls, decontam) # empty matrix {"x": [], "y": [], "c": []} cds_p_dendro_x, cds_p_dendro_y = generate_cds_plot_dendro() if not args.skip_dendrogram else [None, None] - # stacked: index (repeated observations), other observation, rank, rho, pval, pval_corr - cds_p_correlation = generate_cds_correlation(table, args.top_obs_corr) + # stacked: index (repeated observations), other observation, rank, rho + cds_p_correlation = generate_cds_correlation(table, args.top_obs_corr, replace_zero_value) # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars) @@ -279,7 +279,7 @@ def main(): sizes = {} sizes["overview_top_panel_height"] = 300 sizes["overview_top_panel_width_left"] = 250 - sizes["overview_top_panel_width_right"] = 500 + sizes["overview_top_panel_width_right"] = 450 ele = {} @@ -349,7 +349,7 @@ def main(): # correlation ele["correlation"] = {} - ele["correlation"]["fig"], ele["correlation"]["rho_filter"], ele["correlation"]["pval_filter"] = plot_correlation(cds_p_correlation, table.ranks(), dict_d_taxname) + ele["correlation"]["fig"], ele["correlation"]["rho_filter"] = plot_correlation(cds_p_correlation, table.ranks(), dict_d_taxname) ele["correlation"]["wid"] = plot_correlation_widgets(table.ranks(), args.top_obs_corr) # obsbars diff --git a/grimer/layout.py b/grimer/layout.py index 30d4e6c..0d9dab3 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -83,7 +83,6 @@ def make_layout(ele, sizes, version, logo_path, title): row_correlation = row(column(ele["correlation"]["wid"]["rank_select"], ele["correlation"]["wid"]["neg_slider"], ele["correlation"]["wid"]["pos_slider"], - ele["correlation"]["wid"]["pval_spinner"], ele["correlation"]["wid"]["help_button"]), ele["correlation"]["fig"]) diff --git a/grimer/plots.py b/grimer/plots.py index 851366f..c37da48 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -94,10 +94,7 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna y_range=Range1d(start=0, end=100), height=450, sizing_mode="stretch_width", - tools="box_zoom,reset,hover,save", - tooltips=[("Sample", "@index"), - ("Label", "$name"), - ("Value", "@$name")]) + tools="box_zoom,reset,save") # TODO Need to know which rank to get the correct set of top taxa # taxid_name_custom = CustomJSHover( @@ -108,15 +105,16 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna # //console.log(dict_d_topobs); # //return dict_d_taxname.data.dict_d_taxname[0][taxid]; // value holds the @taxid # ''') - # # Add custom tooltip for heatmap (taxid->name) - # obsbars.add_tools(HoverTool( - # tooltips=[ 
- # ('Sample', '@index'), - # ('Taxa', '$name{custom}'), - # ("Value", "@$name") - # ], - # formatters={"$name": taxid_name_custom} - # )) + + # Add custom tooltip for heatmap (taxid->name) + obsbars_fig.add_tools(HoverTool( + tooltips=[("Sample", "@index"), + ("Label", "$name"), + ("Value", "@$name")], + mode="mouse", + point_policy="follow_mouse", + #formatters={"$name": taxid_name_custom} + )) bars = [str(i) for i in range(top_obs_bars)] + ["others", "unassigned"] # Plot stacked bars with counts @@ -132,6 +130,7 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna obsbars_fig.xaxis.subgroup_label_orientation = "vertical" obsbars_fig.xaxis.major_label_text_font_size = "8px" obsbars_fig.xgrid.grid_line_color = None + obsbars_fig.ygrid.grid_line_color = None obsbars_fig.xaxis.axis_label = "samples" obsbars_fig.yaxis.axis_label = "% counts" @@ -863,9 +862,7 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): tooltips=[ ('x', '@taxid{custom}'), ('y', '@index{custom}'), - ('Correlation coefficient (rho)', '@rho'), - ('Raw p-value', '@pval'), - ('Corrected p-value', '@pval_corr') + ('Correlation (rho)', '@rho'), ], formatters={"@taxid": taxid_name_custom, "@index": taxid_name_custom} )) @@ -874,8 +871,7 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): color_mapper = LinearColorMapper(palette=color_palette, low=-1, high=1) rho_filter = IndexFilter() - pval_filter = IndexFilter() - cds_view_correlation = CDSView(source=cds_p_correlation, filters=[rho_filter, pval_filter]) + cds_view_correlation = CDSView(source=cds_p_correlation, filters=[rho_filter]) corr_fig.rect(x="taxid", y="index", width=1, height=1, source=cds_p_correlation, @@ -904,27 +900,27 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): corr_fig.yaxis.minor_tick_line_color = None corr_fig.xaxis.major_label_orientation = "vertical" - return corr_fig, rho_filter, pval_filter + return corr_fig, rho_filter def plot_correlation_widgets(ranks, top_obs_corr): rank_select = Select(title="Taxonomic rank:", value=ranks[0], options=ranks) neg_slider = RangeSlider(start=-1, end=0, value=(-1, 0), step=.01, title="Negative correlation") pos_slider = RangeSlider(start=0, end=1, value=(0, 1), step=.01, title="Positive correlation") - pval_spinner = Spinner(title="Corrected P-value", low=0, high=1, step=0.01, value=1, width=100, height=50) help_text = """ -Spearman correlation coefficient with associated and corrected (Benjamini/Hochberg) p-values between the top """ + str(top_obs_corr) + """ most abundant observations. [spearmanr](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html) function from scipy is used. +Symmetric proportionality coefficient (rho correlation) [1,2] between the top """ + str(top_obs_corr) + """ most abundant observations, based on log-ratios (clr). Only half matrix is displayed, since the values are symmetric. - Negative correlation values [-1 .. 0] are displayed in blue. - Positive correlation values [0 .. 1] are displayed in red. -Only half matrix is displayed, since the values are symmetric. Widgets can control level of correlation and corrected p-value to show. +[1] Lovell, D., Pawlowsky-Glahn, V., Egozcue, J. J., Marguerat, S. & Bähler, J. Proportionality: A Valid Alternative to Correlation for Relative Data. PLOS Computational Biology 11, e1004075 (2015). + +[2] Erb, I. & Notredame, C. How should we measure proportionality on relative gene expression data? Theory Biosci. 135, 21–36 (2016). 
""" return {"rank_select": rank_select, "neg_slider": neg_slider, "pos_slider": pos_slider, - "pval_spinner": pval_spinner, "help_button": help_button(title="Correlation", text=help_text)} diff --git a/grimer/prop.py b/grimer/prop.py deleted file mode 100644 index 0472bc1..0000000 --- a/grimer/prop.py +++ /dev/null @@ -1,79 +0,0 @@ -import numpy as np - - -def vlr(x, y): - return np.var(x - y, ddof=1) # - 2 * np.cov(x, y, ddof=1)[1, 0] + np.var(x, ddof=1) + np.var(y, ddof=1) - - -def phi(x, y): - return vlr(x, y) / np.var(y, ddof=1) - - -def rho(x,y): - return 1 - (vlr(x, y) / (np.var(x, ddof=1) + np.var(y, ddof=1))) - - -def phs(x, y): - r = rho(x, y) - return (1 - r) / (1 + r) - - -def get_prop_matrix(mat, func): - r, c = mat.shape - corr_mat = np.zeros((c, c)) - for i in range(c): - for j in range(c): - corr_mat[i, j] = func(mat[:, i], mat[:, j]) - return corr_mat - - -def pairwise_vlr(mat): - cov = np.cov(mat.T, ddof=1) - diagonal = np.diagonal(cov) - return -2 * cov + diagonal[:, np.newaxis] + diagonal - - -def pairwise_phi(mat): - return pairwise_vlr(mat) / np.var(mat, axis=0, ddof=1) - - -def pairwise_rho(mat): - variances = np.var(mat, axis=0, ddof=1) - return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) - - -def pairwise_phs(mat): - r = pairwise_rho(mat) - return (1 - r) / (1 + r) - - -# from skbio.stats.composition import clr -# counts = np.array([[12,2,3,4],[5,6,7,8],[9,11,12,13]])# print(get_prop_matrix(counts, vlr, np.log)) -# counts = clr(counts) - -# print(get_prop_matrix(counts, vlr)) -# print(get_prop_matrix(counts, phi)) -# print(get_prop_matrix(counts, phs)) -# print(get_prop_matrix(counts, rho)) - -# print(pairwise_vlr(counts)) -# print(pairwise_phi(counts)) -# print(pairwise_phs(counts)) -# print(pairwise_rho(counts)) - -# # compare to each other -# print(np.isclose(get_prop_matrix(counts, vlr), pairwise_vlr(counts)).all()) -# print(np.isclose(get_prop_matrix(counts, phi), pairwise_phi(counts)).all()) -# print(np.isclose(get_prop_matrix(counts, phs), pairwise_phs(counts)).all()) -# print(np.isclose(get_prop_matrix(counts, rho), pairwise_rho(counts)).all()) - -# # compare to propr -# from rpy2.robjects.packages import importr -# from rpy2.robjects import numpy2ri -# propr = importr("propr") -# numpy2ri.activate() - -# print(np.isclose(propr.lr2vlr(counts), pairwise_vlr(counts)).all()) -# print(np.isclose(propr.lr2phi(counts), pairwise_phi(counts)).all()) -# print(np.isclose(propr.lr2phs(counts), pairwise_phs(counts)).all()) -# print(np.isclose(propr.lr2rho(counts), pairwise_rho(counts)).all()) diff --git a/grimer/utils.py b/grimer/utils.py index 8dbf1c6..ac21cdd 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -250,28 +250,6 @@ def parse_single_table(table_df, ranks, tax, default_rank_name): return ranked_tables, lineage -def fdrcorrection_bh(pvals): - """ - Correct multiple p-values with the Benjamini/Hochberg method - Code copied and adapted from: statsmodels.stats.multitest.multipletests - https://github.com/statsmodels/statsmodels/blob/77bb1d276c7d11bc8657497b4307aa7575c3e65c/statsmodels/stats/multitest.py - """ - pvals_sortind = np.argsort(pvals) - pvals_sorted = np.take(pvals, pvals_sortind) - - nobs = len(pvals_sorted) - factors = np.arange(1, nobs + 1) / float(nobs) - - pvals_corrected_raw = pvals_sorted / factors - pvals_corrected = np.minimum.accumulate(pvals_corrected_raw[::-1])[::-1] - pvals_corrected[pvals_corrected > 1] = 1 - - pvals_corrected_ = np.empty_like(pvals_corrected) - pvals_corrected_[pvals_sortind] = pvals_corrected - - 
return pvals_corrected_ - - def transform_table(df, total_counts, transformation, replace_zero_value): # Special case clr with one observation (result in zeros) if transformation == "clr" and df.shape[1] == 1: @@ -545,6 +523,17 @@ def parse_controls(cfg, table): return controls, control_samples +def pairwise_vlr(mat): + cov = np.cov(mat.T, ddof=1) + diagonal = np.diagonal(cov) + return -2 * cov + diagonal[:, np.newaxis] + diagonal + + +def pairwise_rho(mat): + variances = np.var(mat, axis=0, ddof=1) + return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) + + def run_cmd(cmd, print_stderr: bool=False, exit_on_error: bool=True): errcode = 0 stdout = "" From 751b90bcacaf7d36556cd722b7123be371287ca3 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 2 Nov 2021 11:06:30 +0100 Subject: [PATCH 11/50] filter after cumulative counts --- grimer/grimer.py | 20 ++++++++++++-------- grimer/utils.py | 18 +++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/grimer/grimer.py b/grimer/grimer.py index 33eec25..e9949be 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -119,7 +119,7 @@ def main(): args.level_separator = ";" args.transpose = True - table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose, args.min_frequency, args.max_frequency, args.min_count, args.max_count) + table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose) if args.level_separator: ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) else: @@ -131,19 +131,23 @@ def main(): table = Table(table_df.index, total, unassigned) table.lineage = lineage - print_log("Samples: " + str(len(table.samples))) - print_log("Observations: ") + + print_log("") + print_log("Total valid samples: " + str(len(table.samples))) + print_log("") + for r, t in ranked_tables.items(): - print_log(" " + r + ":") + print_log("--- " + r + " ---") + filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count)) if t.empty: - print_log("Skipping without valid entries") + print_log("No valid entries, skipping") else: # Trim table for empty zeros rows/cols - table.add_rank(r, trim_table(t)) - print_log(" " + str(len(table.observations(r))) + " observations") + table.add_rank(r, filtered_trimmed_t) + print_log("Total valid observations: " + str(len(table.observations(r)))) print_log("") - print_log("Total assigned (sum): " + str(table.total.sum())) + print_log("Total assigned (sum): " + str(table.total.sum() - table.unassigned.sum())) print_log("Total unassigned (sum): " + str(table.unassigned.sum())) print_log("") diff --git a/grimer/utils.py b/grimer/utils.py index ac21cdd..d177198 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -5,8 +5,6 @@ import subprocess import shlex import pandas as pd -import yaml -from collections import OrderedDict #Internal from grimer.decontam import Decontam @@ -23,7 +21,7 @@ import scipy.cluster.hierarchy as sch -def parse_input_table(input_file, unassigned_header, transpose, min_frequency, max_frequency, min_count, max_count): +def parse_input_table(input_file, unassigned_header, transpose): if input_file.endswith(".biom"): with open(input_file, "r") as f: @@ -63,8 +61,8 @@ def parse_input_table(input_file, unassigned_header, transpose, min_frequency, m print_log("No unassigned entries defined") print_log("") - print_log("- Filtering table") - table_df = 
trim_table(filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count)) + print_log("- Trimming table") + table_df = trim_table(table_df) # Filter based on the table unassigned = unassigned.reindex(table_df.index) @@ -118,15 +116,15 @@ def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, def trim_table(table_df): # Check for cols/rows with sum zero - zero_rows = table_df.sum(axis=1) == 0 + zero_rows = table_df.sum(axis=1).eq(0) if any(zero_rows): table_df = table_df.loc[~zero_rows, :] - print_log(str(sum(zero_rows)) + " samples without valid counts removed") + print_log(str(sum(zero_rows)) + " samples with only zero removed") - zero_cols = table_df.sum(axis=0) == 0 + zero_cols = table_df.sum(axis=0).eq(0) if any(zero_cols): table_df = table_df.loc[:, ~zero_cols] - print_log(str(sum(zero_cols)) + " observations without valid counts removed") + print_log(str(sum(zero_cols)) + " observations with only zero removed") return table_df @@ -186,6 +184,8 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): if i > 0: lin_count = ranks_df.iloc[:, :i+1].drop_duplicates().groupby(r).count() invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list() + print(invalid) + print(ranks_df.loc[ranks_df[r].isin(invalid), r]) if invalid: print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) # Set to NaN to keep shape of ranks_df From c7dcfde99c34021064380497a8b7aab3f82c20ff Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 10 Nov 2021 17:58:39 +0100 Subject: [PATCH 12/50] several bug fixes and general improvments --- README.md | 4 +-- grimer/callbacks.py | 81 +++++++++++++++++++++------------------------ grimer/grimer.py | 15 +++++++-- grimer/layout.py | 20 ++++++----- grimer/plots.py | 27 ++++++++------- grimer/utils.py | 18 +++++++--- 6 files changed, 91 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 4d2e5e2..0a11b8b 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ grimer -h ## Powered by -[](https://bokeh.org) +[](https://bokeh.org) [](https://pandas.org) -[](https://scipy.org) +[](https://scipy.org) [](https://scikit-bio.org) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 17cc1c8..83e7480 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -26,14 +26,13 @@ def link_obstable_samplebars(ele, y_range=ele["samplebars"]["fig"].y_range, max_total_count=max_total_count), code=''' - console.log("bar_select_callback"); const pdata = cds_p_samplebars.data; const ddata = cds_d_samples.data; const total = cds_d_samples.data["cnt|total"]; const key = "cnt|" + annotbar_rank_select.value + "|" + annotbar_select.value; - console.log(key); + if (y1_select.value=="%"){ for (var i = 0; i < total.length; ++i) { pdata['bar|selected'][i] = (ddata[key][i]/total[i])*100; @@ -57,14 +56,12 @@ def link_obstable_samplebars(ele, change_y_obs_label_callback = CustomJS( args=dict(yaxis=ele["samplebars"]["fig"].yaxis[1]), code=''' - console.log("change_y_obs_label_callback"); yaxis.axis_label = this.value + " observations"; ''') change_y_counts_label_callback = CustomJS( args=dict(yaxis=ele["samplebars"]["fig"].yaxis[0]), code=''' - console.log("change_y_counts_label_callback"); yaxis.axis_label = this.value + " counts"; ''') @@ -77,19 +74,23 @@ def link_obstable_samplebars(ele, cds_d_metadata=cds_d_metadata, samplebars=ele["samplebars"]["fig"]), code=''' - console.log("sort_groupby_callback"); - const samples = cds_d_samples.data["index"]; - // Define value 
from Sort by select var sort_col; + var annot_samples = cds_d_samples.data["index"]; if (sort_select.value=="input_order"){ sort_col = cds_d_samples.data["aux|input_order"]; - }else if (sort_select.value=="#"){ + }else if (sort_select.value=="counts"){ sort_col = cds_d_samples.data["cnt|total"]; }else if (sort_select.value=="selected_annotation"){ sort_col = cds_p_samplebars.data["bar|selected"]; }else if (sort_select.value.startsWith("metadata_num|")){ sort_col = cds_d_metadata.data[sort_select.value.replace('metadata_num|','')]; + + // Annotate label with value + var annot_samples = annot_samples.map(function(s, i) { + return sort_col[i] + " | " + s; + }); + }else if (sort_select.value.startsWith("tax|")){ sort_col = cds_p_samplebars.data[sort_select.value]; } @@ -103,7 +104,7 @@ def link_obstable_samplebars(ele, // Zip sample index and metadata field to create nested factors factors = groupby_col1.map(function(m, i) { - return [m, samples[i]]; + return [m, annot_samples[i]]; }); // second grouping level @@ -112,7 +113,7 @@ def link_obstable_samplebars(ele, var groupby_col2 = cds_d_metadata.data[groupby2_select.value.replace('metadata_cat|','')]; factors = groupby_col2.map(function(m, i) { - return [m, groupby_col1[i], samples[i]]; + return [m, groupby_col1[i], annot_samples[i]]; }); sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, groupby_col2); @@ -122,7 +123,7 @@ def link_obstable_samplebars(ele, }else{ // Single factors, just use the sample index - factors = samples; + factors = annot_samples; sorted_factors = grimer_sort(factors, sort_col, "numeric", false); } @@ -139,13 +140,11 @@ def link_obstable_samplebars(ele, cds_d_samples=cds_d_samples, cds_p_obstable=cds_p_obstable, cds_d_sampleobs=cds_d_sampleobs, - samplebars=ele["samplebars"]["fig"], y_range=ele["samplebars"]["fig"].extra_y_ranges['obs'], min_obs_perc=min_obs_perc, max_total_count=max_total_count, active_ranks=active_ranks), code=''' - console.log("plot_obs_callback"); // get selected row from obstable [0 to get just the first] var row = cds_p_obstable.selected.indices[0]; if (row!=undefined){ @@ -200,7 +199,6 @@ def link_obstable_samplebars(ele, legend_obs=ele["samplebars"]["legend_obs"], active_ranks=active_ranks), code=''' - console.log("change_text_legend_obs_callback"); // selected row var row = cb_obj.indices[0]; for(let r = 0; r < active_ranks.length; r++){ @@ -218,7 +216,6 @@ def link_obstable_samplebars(ele, annotbar_rank_select=ele["samplebars"]["wid"]["annotbar_rank_select"], legend_bars=ele["samplebars"]["legend_bars"]), code=''' - console.log("change_text_legend_bars_callback"); legend_bars.items[0].label = annotbar_rank_select.value + "|" + annotbar_select.value; ''') @@ -227,8 +224,6 @@ def link_obstable_samplebars(ele, cds_p_obstable=cds_p_obstable, dict_d_refs=dict_d_refs), code=''' - console.log("load_infopanel"); - // selected row var row = cb_obj.indices[0]; const name = cds_p_obstable.data['col|name'][row]; @@ -270,7 +265,6 @@ def link_obstable_samplebars(ele, cds_d_decontam=cds_d_decontam, pvalue_input=ele["decontam"]["wid"]["pvalue_input"]), code=''' - console.log("decontam_callback"); // selected row const row = cb_obj.indices[0]; const taxid = cds_p_obstable.data["index"][row]; @@ -300,7 +294,6 @@ def link_obstable_samplebars(ele, cds_p_obstable=cds_p_obstable, cds_p_mgnify=cds_p_mgnify), code=''' - console.log("mgnify_callback"); // selected row const row = cds_p_obstable.selected.indices[0]; const indices = []; @@ -325,7 +318,6 @@ def link_obstable_samplebars(ele, 
cds_p_references=cds_p_references, active_ranks=active_ranks), code=''' - console.log("references_callback"); // selected row const row = cds_p_obstable.selected.indices[0]; const indices = []; @@ -347,11 +339,26 @@ def link_obstable_samplebars(ele, cds_p_references.change.emit(); ''') - obstable_callbacks = [plot_obs_callback, change_text_legend_obs_callback, sort_groupby_callback, load_infopanel, references_callback] + toggle_label_callback = CustomJS( + args=dict(xaxis=ele["samplebars"]["fig"].xaxis[0]), + code=''' + if(this.active.includes(0)){ + xaxis.major_label_text_font_size = "10px"; + xaxis.major_tick_line_color="black"; + }else{ + xaxis.major_label_text_font_size = "0px"; + xaxis.major_tick_line_color=null; + } + ''') + + obstable_callbacks = [plot_obs_callback, change_text_legend_obs_callback, sort_groupby_callback, load_infopanel] if cds_p_decontam: obstable_callbacks.append(decontam_callback) if cds_p_mgnify: obstable_callbacks.append(mgnify_callback) + if ele["references"]["filter"]: + obstable_callbacks.append(references_callback) + cds_p_obstable.selected.js_on_change('indices', *obstable_callbacks) ele["samplebars"]["wid"]["sort_select"].js_on_change('value', sort_groupby_callback) @@ -361,9 +368,11 @@ def link_obstable_samplebars(ele, ele["samplebars"]["wid"]["annotbar_rank_select"].js_on_change('value', bar_select_callback, change_text_legend_bars_callback, sort_groupby_callback) ele["samplebars"]["wid"]["y1_select"].js_on_change('value', bar_select_callback, change_y_counts_label_callback, sort_groupby_callback) ele["samplebars"]["wid"]["y2_select"].js_on_change('value', plot_obs_callback, change_y_obs_label_callback, sort_groupby_callback) + ele["samplebars"]["wid"]["toggle_label"].js_on_click(toggle_label_callback) ele["mgnify"]["wid"]["biome_spinner"].js_on_change('value', mgnify_callback) ele["references"]["wid"]["references_select"].js_on_change('value', references_callback) + def link_heatmap_widgets(ele, cds_d_samples, cds_d_metadata, @@ -384,7 +393,6 @@ def link_heatmap_widgets(ele, cds_p_dendro_x=cds_p_dendro_x, dict_d_dedro_x=dict_d_dedro_x), code=''' - console.log("x_dendro_callback"); if (x_sort_select.value.startsWith("metric|")){ const key = rank_select.value+"|"+x_method_select.value+"|"+x_sort_select.value.replace("metric|",""); cds_p_dendro_x.data = {"x": dict_d_dedro_x[key+"|x"], @@ -408,7 +416,6 @@ def link_heatmap_widgets(ele, cds_p_annotations=cds_p_annotations, cds_p_obstable=cds_p_obstable), code=''' - console.log("x_select_callback"); const rank = rank_select.value; var sorted_factors = []; if (x_sort_select.value=="none"){ @@ -467,7 +474,6 @@ def link_heatmap_widgets(ele, cds_p_dendro_y=cds_p_dendro_y, dict_d_dedro_y=dict_d_dedro_y), code=''' - console.log("y_dendro_callback"); if (y_sort_select.value.startsWith("metric|")){ const key = rank_select.value+"|"+y_method_select.value+"|"+y_sort_select.value.replace("metric|",""); cds_p_dendro_y.data = {"x": dict_d_dedro_y[key+"|x"], @@ -491,7 +497,6 @@ def link_heatmap_widgets(ele, y_sort_select=ele["heatmap"]["wid"]["y_sort_select"], dict_d_hcluster_y=dict_d_hcluster_y), code=''' - console.log("y_select_callback"); var sorted_factors = []; if (y_sort_select.value=="none"){ // None @@ -516,20 +521,20 @@ def link_heatmap_widgets(ele, heatmap.y_range.factors = sorted_factors; ''') - toggle_label_callback = CustomJS( + toggle_labels_callback = CustomJS( args=dict(cds_p_heatmap=cds_p_heatmap, xaxis=ele["heatmap"]["fig"].xaxis[0], yaxis=ele["heatmap"]["fig"].yaxis[0]), code=''' 
if(this.active.includes(0)){ - xaxis.major_label_text_font_size = "12px"; + xaxis.major_label_text_font_size = "10px"; xaxis.major_tick_line_color="black"; }else{ xaxis.major_label_text_font_size = "0px"; xaxis.major_tick_line_color=null; } if(this.active.includes(1)){ - yaxis.major_label_text_font_size = "12px"; + yaxis.major_label_text_font_size = "10px"; yaxis.major_tick_line_color="black"; }else{ yaxis.major_label_text_font_size = "0px"; @@ -537,7 +542,7 @@ def link_heatmap_widgets(ele, } ''') - ele["heatmap"]["wid"]["toggle_label_heatmap"].js_on_click(toggle_label_callback) + ele["heatmap"]["wid"]["toggle_labels"].js_on_click(toggle_labels_callback) ele["heatmap"]["wid"]["rank_select"].js_on_change('value', x_select_callback, x_dendro_callback, y_select_callback, y_dendro_callback) ele["heatmap"]["wid"]["x_method_select"].js_on_change('value', x_select_callback, x_dendro_callback) ele["heatmap"]["wid"]["x_sort_select"].js_on_change('value', x_select_callback, x_dendro_callback) @@ -553,7 +558,6 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols cds_d_metadata=cds_d_metadata), code=''' const index_len = cds_d_metadata.data["index"].length; - console.log(cds_d_metadata.data) var x_factors = []; var empty_y_values = new Array(index_len); for (var i = 0; i < index_len; ++i) empty_y_values[i]=["", ""]; @@ -564,7 +568,6 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols for (var i = 0; i < index_len; ++i){ y_values[i]=[selected, cds_d_metadata.data[selected][i].toString()]; } - console.log(y_values); cds_p_metadata.data["md" + s.toString()] = y_values; x_factors.push("md" + s.toString()); }else{ @@ -616,12 +619,8 @@ def link_obstable_filter(ele, cds_p_obstable, active_ranks): } indices.push(i); } - console.log(cds_p_obstable); - console.log(widgets_filter); widgets_filter.indices = indices; cds_p_obstable.change.emit(); - console.log(cds_p_obstable); - console.log(widgets_filter); ''') ele["obstable"]["wid"]["frequency_spinner"].js_on_change('value', filter_callback) ele["obstable"]["wid"]["counts_perc_avg_spinner"].js_on_change('value', filter_callback) @@ -634,7 +633,6 @@ def link_correlation_widgets(ele, cds_p_correlation): args=dict(correlation=ele["correlation"]["fig"], cds_p_correlation=cds_p_correlation), code=''' - console.log("rank_select_callback"); const factors = new Set(); for(let i = 0; i < cds_p_correlation.data["index"].length; i++){ if(cds_p_correlation.data["rank"][i]==this.value){ @@ -651,7 +649,6 @@ def link_correlation_widgets(ele, cds_p_correlation): pos_slider=ele["correlation"]["wid"]["pos_slider"], cds_p_correlation=cds_p_correlation), code=''' - console.log("filter_callback"); const indices = []; for (var i = 0; i < cds_p_correlation.data["index"].length; i++) { const rho = cds_p_correlation.data["rho"][i]; @@ -681,7 +678,6 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, cds_d_sampleobs, cds dict_d_taxname=dict_d_taxname, top_obs_bars=top_obs_bars), code=''' - console.log("rank_select_callback"); const rank = this.value; const n_sample = cds_p_obsbars.data["index"].length; const total = cds_d_samples.data["cnt|total"]; @@ -729,8 +725,6 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, cds_d_sampleobs, cds cds_d_samples=cds_d_samples, cds_d_metadata=cds_d_metadata), code=''' - console.log("sort_groupby_callback"); - // Define value from Sort by select var sort_col; var annot_samples = cds_d_samples.data["index"]; @@ -738,7 +732,6 @@ def link_obsbars_widgets(ele, 
cds_p_obsbars, dict_d_topobs, cds_d_sampleobs, cds sort_col = cds_d_samples.data["aux|input_order"]; }else if (sort_select.value.startsWith("metadata_num|")){ sort_col = cds_d_metadata.data[sort_select.value.replace('metadata_num|','')]; - // Annotate label with value var annot_samples = annot_samples.map(function(s, i) { return sort_col[i] + " | " + s; @@ -791,11 +784,11 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, cds_d_sampleobs, cds args=dict(xaxis=ele["obsbars"]["fig"].xaxis[0]), code=''' if(this.active.includes(0)){ + xaxis.major_label_text_font_size = "10px"; + xaxis.major_tick_line_color="black"; + }else{ xaxis.major_label_text_font_size = "0px"; xaxis.major_tick_line_color=null; - }else{ - xaxis.major_label_text_font_size = "12px"; - xaxis.major_tick_line_color="black"; } ''') diff --git a/grimer/grimer.py b/grimer/grimer.py index e9949be..a1634d8 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -48,7 +48,8 @@ def main(): table_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hiearchical multi-level table where the observations headers are separated by the indicated separator characther (usually ';' or '|')") table_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table (if samples are listed on columns and observations on rows)") table_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unsassinged/unclassified counts.") - table_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on observations headers usin (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") + table_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on table observations labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") + table_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on table sample labels/headers (support regex). 
Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") filter_group = parser.add_argument_group('Observation filter options') filter_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") @@ -119,7 +120,7 @@ def main(): args.level_separator = ";" args.transpose = True - table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose) + table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose, args.sample_replace) if args.level_separator: ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) else: @@ -134,6 +135,11 @@ def main(): print_log("") print_log("Total valid samples: " + str(len(table.samples))) + # Check for long sample headers, break some plots + long_sample_headers = [h for h in table_df.index if len(h)>70] + if long_sample_headers: + print_log("Long sample labels/headers detected, plots may break: ") + print_log("\n".join(long_sample_headers)) print_log("") for r, t in ranked_tables.items(): @@ -298,7 +304,10 @@ def main(): # references ele["references"] = {} - ele["references"]["fig"], ele["references"]["filter"] = plot_references(sizes, table, cds_p_references, dict_d_taxname) + if references: + ele["references"]["fig"], ele["references"]["filter"] = plot_references(sizes, table, cds_p_references, dict_d_taxname) + else: + ele["references"]["fig"], ele["references"]["filter"] = None, None ele["references"]["wid"] = plot_references_widgets(sizes, references) # mgnify diff --git a/grimer/layout.py b/grimer/layout.py index 0d9dab3..11c93f1 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -17,10 +17,12 @@ def make_layout(ele, sizes, version, logo_path, title): width=sizes["overview_top_panel_width_left"]) info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] - info_tabs.append(Panel(child=column(ele["references"]["fig"], - row(ele["references"]["wid"]["references_select"], - ele["references"]["wid"]["help_button"]) - ), title="References")) + + if ele["references"]["fig"]: + info_tabs.append(Panel(child=column(ele["references"]["fig"], + row(ele["references"]["wid"]["references_select"], + ele["references"]["wid"]["help_button"]) + ), title="References")) if ele["mgnify"]["fig"]: info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], @@ -53,7 +55,8 @@ def make_layout(ele, sizes, version, logo_path, title): ele["samplebars"]["wid"]["groupby2_select"], ele["samplebars"]["wid"]["sort_select"], ele["samplebars"]["wid"]["y2_select"], - ele["samplebars"]["wid"]["help_button"])) + ele["samplebars"]["wid"]["help_button"]), + ele["samplebars"]["wid"]["toggle_label"]) row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]], [ele["dendrox"]["fig"]], @@ -62,8 +65,7 @@ def make_layout(ele, sizes, version, logo_path, title): merge_tools=True) row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"], - row(ele["heatmap"]["wid"]["toggle_label_text"], - ele["heatmap"]["wid"]["toggle_label_heatmap"]), + ele["heatmap"]["wid"]["toggle_labels"], sizing_mode="stretch_height", width=300), column(row(ele["heatmap"]["wid"]["x_sort_select"], @@ -87,12 +89,12 @@ def make_layout(ele, sizes, version, logo_path, title): ele["correlation"]["fig"]) row_obsbars = column(row(ele["obsbars"]["fig"]), - 
ele["obsbars"]["wid"]["toggle_label"], row(ele["obsbars"]["wid"]["rank_select"], ele["obsbars"]["wid"]["groupby1_select"], ele["obsbars"]["wid"]["groupby2_select"], ele["obsbars"]["wid"]["sort_select"], - ele["obsbars"]["wid"]["help_button"])) + ele["obsbars"]["wid"]["help_button"]), + ele["obsbars"]["wid"]["toggle_label"]) main_panels = [] main_panels.append(Panel(child=column(row_obstable, row_barpot, sizing_mode="stretch_width"), title="Overview")) diff --git a/grimer/plots.py b/grimer/plots.py index c37da48..d008d78 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -71,8 +71,10 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): legend_obs.orientation = "horizontal" legend_obs.location = "bottom_right" legend_obs.click_policy = "hide" + legend_obs.label_text_color = "#606c38" samplebars_fig.add_layout(legend_obs, "above") + samplebars_fig.xaxis.major_label_orientation = "vertical" samplebars_fig.xaxis.major_label_text_font_size = '0pt' samplebars_fig.xgrid.grid_line_color = None samplebars_fig.xaxis.major_tick_line_color = None @@ -85,6 +87,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): samplebars_fig.xaxis.axis_label = "samples" samplebars_fig.yaxis[0].axis_label = "# counts" samplebars_fig.yaxis[1].axis_label = "% observations" + samplebars_fig.yaxis[1].axis_label_text_color = "#606c38" return samplebars_fig, legend_obs, legend_bars @@ -128,7 +131,9 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna obsbars_fig.xaxis.major_label_orientation = "vertical" obsbars_fig.xaxis.group_label_orientation = "horizontal" obsbars_fig.xaxis.subgroup_label_orientation = "vertical" - obsbars_fig.xaxis.major_label_text_font_size = "8px" + obsbars_fig.xaxis.minor_tick_line_color = None + obsbars_fig.xaxis.major_tick_line_color = None + obsbars_fig.xaxis.major_label_text_font_size = "0px" obsbars_fig.xgrid.grid_line_color = None obsbars_fig.ygrid.grid_line_color = None obsbars_fig.xaxis.axis_label = "samples" @@ -202,7 +207,7 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs groupby1_select = Select(title="1) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") groupby2_select = Select(title="2) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") - toggle_label = CheckboxGroup(labels=["Hide sample labels"], active=[]) + toggle_label = CheckboxGroup(labels=["Show/Hide samples labels"], active=[]) help_text = """ Observation bars showing proportions of top """ + str(top_obs_bars) + """ most abundant observations. 
@@ -235,7 +240,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec y2_select = Select(title="Observations", value="%", options=["#", "%", "log10(#)", "log10(%)"], width=80) sort_options = {} - sort_options["Default"] = [("input_order", "input order"), ("#", "# counts"), ("selected_annotation", "selected annotation")] + sort_options["Default"] = [("input_order", "input order"), ("counts", "counts"), ("selected_annotation", "selected annotation")] sort_options["Selected Rank"] = [("tax|" + r, r) for r in ranks] sort_options["Numeric Metadata"] = [] if metadata: @@ -254,6 +259,8 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec groupby1_select = Select(title="1) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") groupby2_select = Select(title="2) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") + toggle_label = CheckboxGroup(labels=["Show/Hide samples labels"], active=[]) + help_text = """ Bars showing total counts (left y-axis) for each sample (x-axis). @@ -273,6 +280,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec "sort_select": sort_select, "groupby1_select": groupby1_select, "groupby2_select": groupby2_select, + "toggle_label": toggle_label, "help_button": help_button(title="Sample bars", text=help_text)} @@ -491,7 +499,7 @@ def plot_references(sizes, table, cds_p_references, dict_d_taxname): def plot_references_widgets(sizes, references): - references_select = Select(value=list(references.keys())[0], width=sizes["overview_top_panel_width_right"] - 70, options=list(references.keys())) + references_select = Select(value=list(references.keys())[0] if references else None, width=sizes["overview_top_panel_width_right"] - 70, options=list(references.keys())) help_text = """ Plot of number of occurences of provided references for each observation and its lineage. @@ -656,16 +664,13 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name if categorical_md_data: y_sort_options["Sort by Categorical Metadata"] = [("metadata_cat|" + md, md) for md in categorical_md_data] - x_sort_select = Select(title="Observation cluster/sort:", value="none", options=x_sort_options) x_method_select = Select(title="Observation clustering method:", value=linkage_methods[0], options=linkage_methods, disabled=True) - #x_group_select = Select(title="Observation group by:", value="none", options={"Default": ["none"], "Taxonomic ranks": ranks}) - + y_sort_select = Select(title="Sample cluster/sort:", value="none", options=y_sort_options) y_method_select = Select(title="Sample clustering method:", value=linkage_methods[0], options=linkage_methods, disabled=True) - toggle_label_text = Div(text="show/hide labels") - toggle_label_heatmap = CheckboxGroup(labels=["Observations", "Samples"], active=[]) + toggle_labels = CheckboxGroup(labels=["Show/Hide observations labels", "Show/Hide samples labels"], active=[]) help_text = """ The heatmap shows [transformed] values from the input table (color bar on top). If taxonomy is provided, one heatmap for each taxonomic rank is generated. 
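The cluster/sort selects above drive precomputed orderings (`dict_d_hcluster_x/y`, `dict_d_dedro_x/y`). For reference, here is a minimal sketch of how such an ordering can be derived with scipy's hierarchical clustering, which `utils.py` already imports as `sch`; the toy matrix and the `"average"`/`"euclidean"` choices are illustrative stand-ins for whatever `linkage_methods`/`linkage_metrics` the user selects:

```python
# Sketch: derive a sample order for heatmap factors via hierarchical
# clustering. GRIMER's utils.py imports scipy.cluster.hierarchy as sch;
# the matrix and method/metric values below are illustrative only.
import numpy as np
import scipy.cluster.hierarchy as sch

counts = np.array([[10., 0., 3.],   # samples x observations (toy values)
                   [9., 1., 2.],
                   [0., 8., 7.]])

Z = sch.linkage(counts, method="average", metric="euclidean")
leaf_order = sch.leaves_list(Z)     # dendrogram leaf indices, e.g. [2, 0, 1]

samples = ["sampleA", "sampleB", "sampleC"]
sorted_factors = [samples[i] for i in leaf_order]
```

`leaves_list` gives the dendrogram leaf order, which is the kind of factor list the callbacks above assign to `heatmap.x_range.factors` / `heatmap.y_range.factors`.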
@@ -682,11 +687,9 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name return {"rank_select": rank_select, "x_method_select": x_method_select, "x_sort_select": x_sort_select, - #"x_group_select": x_group_select, "y_method_select": y_method_select, "y_sort_select": y_sort_select, - "toggle_label_heatmap": toggle_label_heatmap, - "toggle_label_text": toggle_label_text, + "toggle_labels": toggle_labels, "help_button": help_button(title="Heatmap/Clustering", text=help_text)} diff --git a/grimer/utils.py b/grimer/utils.py index d177198..7ff1279 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -21,11 +21,12 @@ import scipy.cluster.hierarchy as sch -def parse_input_table(input_file, unassigned_header, transpose): +def parse_input_table(input_file, unassigned_header, transpose, sample_replace): if input_file.endswith(".biom"): - with open(input_file, "r") as f: + with open(input_file, encoding="utf8", errors='ignore') as f: table_df = parse_table_biom(f).to_dataframe(dense=True) + # biom convert -i feature-table.biom -o feature-table.biom.tsv --to-tsv else: # Default input_file: index=observations, columns=samples # table_df should have samples on indices and observations on columns @@ -38,6 +39,17 @@ def parse_input_table(input_file, unassigned_header, transpose): # Remove header on rows table_df.index.names = [None] + # Replace text on sample labels + if sample_replace: + print_log("Replacing sample label values:") + before_replace = table_df.head(1).index + #get index as series to use replace method + new_index = table_df.reset_index()["index"].replace(regex=dict(zip(sample_replace[::2], sample_replace[1::2]))) + table_df.set_index(new_index, inplace=True) + for b, a in zip(before_replace, table_df.head(1).index): + print_log(" " + b + " -> " + a) + print_log(" ...") + # Sum total before split unassigned or filter total = table_df.sum(axis=1) @@ -184,8 +196,6 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): if i > 0: lin_count = ranks_df.iloc[:, :i+1].drop_duplicates().groupby(r).count() invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list() - print(invalid) - print(ranks_df.loc[ranks_df[r].isin(invalid), r]) if invalid: print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) # Set to NaN to keep shape of ranks_df From 31919ddbf203a0e01334180f5a020c7d512c03aa Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 10 Nov 2021 18:00:37 +0100 Subject: [PATCH 13/50] images of README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0a11b8b..3efaaa7 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ grimer -h ## Powered by -[](https://bokeh.org) -[](https://pandas.org) -[](https://scipy.org) -[](https://scikit-bio.org) +[](https://bokeh.org) +[](https://pandas.org) +[](https://scipy.org) +[](https://scikit-bio.org) From b6059af09ca9b85f49113c5081a9295de30f916f Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Wed, 10 Nov 2021 18:01:13 +0100 Subject: [PATCH 14/50] images of README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 3efaaa7..732159a 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ grimer -h ## Powered by -[](https://bokeh.org) -[](https://pandas.org) -[](https://scipy.org) -[](https://scikit-bio.org) +[](https://bokeh.org) +[](https://pandas.org) +[](https://scipy.org) +[](https://scikit-bio.org) From 80e90d0ebfa3950ae0cf4f7d58823110fb5c518f Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 25 Jan 2022 16:13:14 +0100 Subject: [PATCH 15/50] all numerics to float on metadata heatmap, fix mismatching values on py js --- grimer/callbacks.py | 15 ++++++++--- grimer/cds.py | 5 ++-- grimer/decontam.py | 6 ++--- grimer/grimer.py | 2 +- grimer/metadata.py | 10 ++------ grimer/mgnify.py | 2 -- grimer/plots.py | 61 +++++++++++++-------------------------------- grimer/reference.py | 2 +- grimer/utils.py | 37 ++++++++++++++++++++++++++- 9 files changed, 75 insertions(+), 65 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 83e7480..478fc43 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -566,12 +566,19 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols var selected = this.value[s]; var y_values = new Array(index_len); for (var i = 0; i < index_len; ++i){ - y_values[i]=[selected, cds_d_metadata.data[selected][i].toString()]; + var val = cds_d_metadata.data[selected][i]; + // fix conversion from float to integer (30.0).toString() = "30" + if (Number.isInteger(val)){ + val = val + ".0"; + } else { + val = val.toString(); + } + y_values[i]=[selected, val]; } - cds_p_metadata.data["md" + s.toString()] = y_values; - x_factors.push("md" + s.toString()); + cds_p_metadata.data[(s+1).toString()] = y_values; + x_factors.push((s+1).toString()); }else{ - cds_p_metadata.data["md" + s.toString()] = empty_y_values; + cds_p_metadata.data[(s+1).toString()] = empty_y_values; } } metadata_heatmap.x_range.factors = x_factors; diff --git a/grimer/cds.py b/grimer/cds.py index 75ddd60..5809efc 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -4,7 +4,7 @@ from math import pi #Internal -from grimer.utils import print_df, transform_table, print_log, pairwise_rho +from grimer.utils import print_df, transform_table, print_log, pairwise_rho, format_js_toString #Bokeh from bokeh.models import ColumnDataSource @@ -212,7 +212,8 @@ def generate_cds_plot_metadata(metadata, max_metadata_cols): metadata_fields = metadata.get_col_headers().to_list() for i in range(max_metadata_cols): # Same transformation done in the colormap for numeric entries - df_plot_md["md" + str(i)] = [(metadata_fields[i], '{:.16g}'.format(md_value) if not isinstance(md_value, str) else md_value) for md_value in metadata.get_col(metadata_fields[i])] + #df_plot_md[str(i+1)] = [(metadata_fields[i], '{:.16g}'.format(md_value) if not isinstance(md_value, str) else md_value) for md_value in metadata.get_col(metadata_fields[i])] + df_plot_md[str(i + 1)] = [(metadata_fields[i], format_js_toString(md_value)) for md_value in metadata.get_col(metadata_fields[i])] print_df(df_plot_md, "df_plot_md -> cds_p_metadata") return ColumnDataSource(df_plot_md) diff --git a/grimer/decontam.py b/grimer/decontam.py index b8853b4..7cd6fc9 100644 --- a/grimer/decontam.py +++ b/grimer/decontam.py @@ -17,13 +17,13 @@ def add_rank_results(self, rank, decontam_out_file, decontam_mod_file): # Parse models enforcing index as 
string mod = pd.read_table(decontam_mod_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0: str}) - + # Remove point counter at end (.1 or .1000) mod.index = mod.index.map(lambda txid: txid[:-5] if txid.endswith(".1000") else txid[:-2]).to_list() - + # Merge first point of model self.rank[rank] = self.rank[rank].merge(mod.iloc[0::2, 0], left_index=True, right_index=True) - + # Merge second point of model and non-contant line self.rank[rank] = self.rank[rank].merge(mod.iloc[1::2, :], suffixes=["", "_2"], left_index=True, right_index=True) diff --git a/grimer/grimer.py b/grimer/grimer.py index a1634d8..9eb2c3a 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -136,7 +136,7 @@ def main(): print_log("") print_log("Total valid samples: " + str(len(table.samples))) # Check for long sample headers, break some plots - long_sample_headers = [h for h in table_df.index if len(h)>70] + long_sample_headers = [h for h in table_df.index if len(h) > 70] if long_sample_headers: print_log("Long sample labels/headers detected, plots may break: ") print_log("\n".join(long_sample_headers)) diff --git a/grimer/metadata.py b/grimer/metadata.py index 4399ade..8eabb4c 100644 --- a/grimer/metadata.py +++ b/grimer/metadata.py @@ -9,7 +9,7 @@ class Metadata: def __init__(self, metadata_file, samples: list=[]): # Read metadata and let pandas guess dtypes, index as str - self.data = pd.read_table(metadata_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0:str}) + self.data = pd.read_table(metadata_file, sep='\t', header=0, skiprows=0, index_col=0, dtype={0: str}) # Enforce string index self.data.index = self.data.index.astype('str') @@ -85,13 +85,7 @@ def get_col(self, col): return self.data[col] def get_unique_values(self, col): - return sorted(self.get_col(col).dropna().unique()) - - def get_formatted_unique_values(self, col): - if self.types[col] == "categorical": - return self.get_unique_values(col) - else: - return list(map('{:.16g}'.format, self.get_unique_values(col))) + return self.get_col(col).dropna().unique() def get_type(self, col): return self.types[col] diff --git a/grimer/mgnify.py b/grimer/mgnify.py index 84d2eb5..db8b974 100644 --- a/grimer/mgnify.py +++ b/grimer/mgnify.py @@ -23,5 +23,3 @@ def parse(self, file, ranks): def update_taxids(self, taxid_updated): # Update taxonomy to taxid or keep name if not available self.data["taxa"] = self.data[["rank", "taxa"]].apply(lambda rt: taxid_updated[(rt[0], rt[1])] if taxid_updated[(rt[0], rt[1])] is not None else rt[1], axis=1) - - diff --git a/grimer/plots.py b/grimer/plots.py index d008d78..3b8b8fa 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -1,12 +1,14 @@ import markdown + # Bokeh -from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, Div, FactorRange, FuncTickFormatter, HoverTool, Legend, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, Range1d, RangeSlider, Select, Spinner, Tabs, TextAreaInput, TextInput +from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FuncTickFormatter, HoverTool, Legend, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, Range1d, RangeSlider, Select, Spinner, Tabs, TextAreaInput, TextInput from bokeh.models.filters import IndexFilter, GroupFilter from bokeh.models.widgets import DataTable, TableColumn -from 
bokeh.palettes import Blues, Category10, Category20, Colorblind, Dark2, linear_palette, Magma256, Reds, Turbo256 +from bokeh.palettes import Blues, Dark2, Magma256, Reds from bokeh.plotting import figure from bokeh.transform import cumsum, factor_cmap +from grimer.utils import format_js_toString, make_color_palette def plot_samplebars(cds_p_samplebars, max_total_count, ranks): # Bar plots has 3 main stacks: selection, others, unassigned @@ -44,7 +46,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): for i, rank in enumerate(ranks): ren = samplebars_fig.scatter(x="aux|factors", y="tax|" + rank, y_range_name="obs", - name="tax|" + rank, #to work with hover properly + name="tax|" + rank, # to work with hover properly source=cds_p_samplebars, marker="circle", size=7, line_color="navy", alpha=0.6, fill_color=obs_palette[i]) @@ -737,10 +739,10 @@ def plot_dendrogram(heatmap, tools_heatmap, cds_p_dendro_x, cds_p_dendro_y): def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metadata): - # Number of cols added to the plot cds (user input) - ncols = len(cds_p_metadata.data) - 1 + # Get fixed headers from cds + cols = list(cds_p_metadata.data.keys())[1:] - metadata_fig = figure(x_range=["md" + str(c) for c in range(ncols)], + metadata_fig = figure(x_range=cols, y_range=heatmap.y_range, tools=tools_heatmap, x_axis_location="above", @@ -760,32 +762,33 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada factors = [] palette = [] for i, md_header in enumerate(metadata_fields): - unique_values = metadata.get_unique_values(md_header) + unique_values = sorted(metadata.get_unique_values(md_header)) if unique_values: n = len(unique_values) if metadata.get_type(md_header) == "numeric": unique_palette = make_color_palette(n, linear=True) + unique_values = map(format_js_toString, unique_values) else: unique_palette = make_color_palette(n) assert len(unique_palette) == n, 'Wrong number of colors on palette' palette.extend(unique_palette) - factors.extend([(md_header, md_value) for md_value in metadata.get_formatted_unique_values(md_header)]) + factors.extend([(md_header, md_value) for md_value in unique_values]) metadata_colormap = CategoricalColorMapper(palette=palette, factors=factors) # Custom tooltip to show metadata field and value md_custom = CustomJSHover(code='return value[0] ? "(" + value[0] + ") " + value[1] : "";') tooltips = [('Sample', '@index')] formatters = {} - for i in range(ncols): - tooltips.append(("md" + str(i), "@md" + str(i) + "{custom}")) - formatters["@md" + str(i)] = md_custom + for col in cols: + tooltips.append((col, "@" + col + "{custom}")) + formatters["@" + col] = md_custom metadata_fig.add_tools(HoverTool(tooltips=tooltips, formatters=formatters)) - for i in range(ncols): - metadata_fig.rect(x=i + 0.5, y="index", + for col in cols: + metadata_fig.rect(x=dict(value=col), y="index", width=1, height=1, source=cds_p_metadata, - fill_color={'field': "md" + str(i), 'transform': metadata_colormap}, + fill_color={'field': col, 'transform': metadata_colormap}, line_color=None) metadata_fig.xaxis.axis_label = "metadata" @@ -799,7 +802,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada metadata_fig.yaxis.axis_line_color = None metadata_fig.ygrid.grid_line_color = None - metadata_multiselect = MultiSelect(title="Metadata to show ('md#', max. 
" + str(ncols) + "):", value=[metadata_fields[c] for c in range(ncols)], options=metadata_fields) + metadata_multiselect = MultiSelect(title="Metadata to show (max. " + str(len(cols)) + " columns):", value=[metadata_fields[c] for c in range(len(cols))], options=metadata_fields) return metadata_fig, {"metadata_multiselect": metadata_multiselect} @@ -940,31 +943,3 @@ def help_button(title: str="", text: str="", align: str="end"): hb.js_on_click(CustomJS(code="pop.open('" + title + "', '" + html_text + "');")) return hb - -def make_color_palette(n_colors, linear: bool=False, palette: dict=None): - if isinstance(palette, dict) and n_colors <= max(palette.keys()): - # Special case for 1 and 2 (not in palettes) - palette = palette[3 if n_colors < 3 else n_colors] - - if linear or n_colors > 20: - if not palette: - palette = Turbo256 - if n_colors <= 256: - return linear_palette(palette, n_colors) - else: - # Repeat colors - return [palette[int(i * 256.0 / n_colors)] for i in range(n_colors)] - else: - # Select color palette based on number of requested colors - # Return the closest palette with most distinc set of colors - if not palette: - if n_colors <= 8: - palette = Colorblind[8] - elif n_colors <= 10: - palette = Category10[10] - elif n_colors <= 20: - palette = Category20[20] - else: - palette = Turbo256 - - return palette[:n_colors] diff --git a/grimer/reference.py b/grimer/reference.py index a7e99a7..82c8e62 100644 --- a/grimer/reference.py +++ b/grimer/reference.py @@ -49,7 +49,7 @@ def update_taxids(self, taxid_updated): print("Updated taxonomic node: " + node + " -> " + upd_node) self.add(upd_node) self.ids[upd_node].update(self.ids[node]) - self.ids.discard(node) + del self.ids[node] def get_refs_desc(self, i, direct: bool=False, parents: bool=False): refs_desc = {} diff --git a/grimer/utils.py b/grimer/utils.py index 7ff1279..7e26c81 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -8,9 +8,11 @@ #Internal from grimer.decontam import Decontam -from grimer.plots import make_color_palette from grimer.reference import Reference +# Bokeh +from bokeh.palettes import Blues, Category10, Category20, Colorblind, Dark2, linear_palette, Magma256, Reds, Turbo256 + #biom from biom import parse_table as parse_table_biom @@ -544,6 +546,39 @@ def pairwise_rho(mat): return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) +def format_js_toString(val): + # Transform numeric value to float and string to match toString + return str(float(val)) if isinstance(val, (int, float)) else str(val) + + +def make_color_palette(n_colors, linear: bool=False, palette: dict=None): + if isinstance(palette, dict) and n_colors <= max(palette.keys()): + # Special case for 1 and 2 (not in palettes) + palette = palette[3 if n_colors < 3 else n_colors] + + if linear or n_colors > 20: + if not palette: + palette = Turbo256 + if n_colors <= 256: + return linear_palette(palette, n_colors) + else: + # Repeat colors + return [palette[int(i * 256.0 / n_colors)] for i in range(n_colors)] + else: + # Select color palette based on number of requested colors + # Return the closest palette with most distinc set of colors + if not palette: + if n_colors <= 8: + palette = Colorblind[8] + elif n_colors <= 10: + palette = Category10[10] + elif n_colors <= 20: + palette = Category20[20] + else: + palette = Turbo256 + + return palette[:n_colors] + def run_cmd(cmd, print_stderr: bool=False, exit_on_error: bool=True): errcode = 0 stdout = "" From d1fb8e11a24b5f376c3f6ff29366e800dc3016cf Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 27 Jan 2022 10:16:51 +0100 Subject: [PATCH 16/50] small fix readme files --- LICENSE | 2 +- env.yaml | 4 +--- files/README.md | 26 +++++++++++++++----------- scripts/.Rhistory | 0 setup.py | 4 ++-- 5 files changed, 19 insertions(+), 17 deletions(-) delete mode 100644 scripts/.Rhistory diff --git a/LICENSE b/LICENSE index 993f608..adf1d0c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2021 pirovc.github.io +Copyright (c) 2022 pirovc.github.io Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/env.yaml b/env.yaml index ad3dca1..1bdbfaa 100644 --- a/env.yaml +++ b/env.yaml @@ -15,6 +15,4 @@ dependencies: - bioconductor-decontam==1.10.0 #DECONTAM - r-optparse==1.6.6 #DECONTAM - biom-format>=2.1.10 #biom - - jsonapi-client>=0.9.7 #mgnify scripts -# - r-propr #propr -# - rpy2 #propr \ No newline at end of file + - jsonapi-client>=0.9.7 #mgnify scripts \ No newline at end of file diff --git a/files/README.md b/files/README.md index 8002000..8219e7f 100644 --- a/files/README.md +++ b/files/README.md @@ -4,24 +4,28 @@ 1) File with a list (one per line) of taxonomic identifiers or taxonomic names -or +2) or formatted `.yml` file: -2) Formatted `.yml` file: - "General Description": - "Specific description": - url: "www.website.com?id={}" - ids: [1,2,3] +```yaml +"General Description": + "Specific description": + url: "www.website.com?id={}" + ids: [1,2,3] +``` + The url can be a link to the entries listed on the id. Use the `{}` as a placeholder for the id. Example: `https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}` The files should be provided in the main configuration file for grimer as follows: - references: - "Contaminants": "files/contaminants.yml" - "Human-related": "files/human-related.yml" - "CUSTOM CONTAMINANTS": "file.txt" - "LAB RELATED BACTERIA": "another_file.yml" +```yaml +references: + "Contaminants": "files/contaminants.yml" + "Human-related": "files/human-related.yml" + "CUSTOM CONTAMINANTS": "file.txt" + "LAB RELATED BACTERIA": "another_file.yml" +``` ### contaminants.yml diff --git a/scripts/.Rhistory b/scripts/.Rhistory deleted file mode 100644 index e69de29..0000000 diff --git a/setup.py b/setup.py index 8d8465f..4c37b11 100755 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ def read(filename): setup( name="grimer", - version="0.0.0", + version="1.0.0-alpha1", url="https://www.github.com/pirovc/grimer", license='MIT', @@ -24,7 +24,7 @@ def read(filename): long_description=read("README.md"), packages=['grimer'], - #install_requires=['binpacking==1.4.3'], + #install_requires=['bokeh==2.2.3','pandas','numpy','scipy','multitax'], include_package_data=True, package_data={ From 422ebb9f244f3f75586eda9fa799c4e73a481611 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 27 Jan 2022 10:17:13 +0100 Subject: [PATCH 17/50] readme files --- files/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/files/README.md b/files/README.md index 8219e7f..5e74fba 100644 --- a/files/README.md +++ b/files/README.md @@ -6,7 +6,6 @@ 2) or formatted `.yml` file: - ```yaml "General Description": "Specific description": @@ -14,7 +13,6 @@ ids: [1,2,3] ``` - The url can be a link to the entries listed on the id. Use the `{}` as a placeholder for the id. 
Example: `https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id={}` The files should be provided in the main configuration file for grimer as follows: From 4ae1f2430e1659ae9fad1fe01dd033346aa109ba Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 27 Jan 2022 15:59:41 +0100 Subject: [PATCH 18/50] sparse cds_d_sampleobs replace by dict_d_sampleobs --- grimer/callbacks.py | 52 ++++++++++++++++++++++++++++----------------- grimer/cds.py | 17 ++++++++++++++- grimer/grimer.py | 7 +++--- 3 files changed, 52 insertions(+), 24 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 478fc43..d41da2c 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -5,7 +5,7 @@ def link_obstable_samplebars(ele, cds_p_obstable, cds_p_samplebars, cds_d_samples, - cds_d_sampleobs, + dict_d_sampleobs, cds_d_metadata, cds_p_decontam, cds_p_decontam_models, @@ -139,7 +139,7 @@ def link_obstable_samplebars(ele, cds_p_samplebars=cds_p_samplebars, cds_d_samples=cds_d_samples, cds_p_obstable=cds_p_obstable, - cds_d_sampleobs=cds_d_sampleobs, + dict_d_sampleobs=dict_d_sampleobs, y_range=ele["samplebars"]["fig"].extra_y_ranges['obs'], min_obs_perc=min_obs_perc, max_total_count=max_total_count, @@ -152,23 +152,24 @@ def link_obstable_samplebars(ele, const total = cds_d_samples.data["cnt|total"]; // for each rank for(let r = 0; r < active_ranks.length; r++){ + // get rank + let rank = active_ranks[r]; // get taxid of the rank - let rank_taxid = cds_p_obstable.data["tax|"+active_ranks[r]][row]; + let taxid = cds_p_obstable.data["tax|"+rank][row]; // for each sample - for (var i = 0; i < cds_d_sampleobs.length; i++) { + for (var i = 0; i < cds_d_samples.length; i++) { + let sample = cds_d_samples.data["index"][i]; let val = 0; - // if taxid for the rank exists, [transform and] copy values over to the cds_p_samplebars - if (rank_taxid){ - val = cds_d_sampleobs.data[rank_taxid][i]; + // if taxid exists in the lineage, [transform and] copy values over to the cds_p_samplebars + if (taxid){ + val = dict_d_sampleobs[rank][taxid][sample]; if(val>0){ - if (y2_select.value=="#"){ - val = cds_d_sampleobs.data[rank_taxid][i]; - }else if (y2_select.value=="%"){ - val = (cds_d_sampleobs.data[rank_taxid][i]/total[i])*100; + if (y2_select.value=="%"){ + val = (val/total[i])*100; }else if (y2_select.value=="log10(%)"){ - val = Math.log10((cds_d_sampleobs.data[rank_taxid][i]/total[i])*100); + val = Math.log10((val/total[i])*100); }else if (y2_select.value=="log10(#)"){ - val = Math.log10(cds_d_sampleobs.data[rank_taxid][i]); + val = Math.log10(val); } } } @@ -259,7 +260,7 @@ def link_obstable_samplebars(ele, decontam_callback = CustomJS( args=dict(cds_d_samples=cds_d_samples, cds_p_obstable=cds_p_obstable, - cds_d_sampleobs=cds_d_sampleobs, + dict_d_sampleobs=dict_d_sampleobs, cds_p_decontam=cds_p_decontam, cds_p_decontam_models=cds_p_decontam_models, cds_d_decontam=cds_d_decontam, @@ -268,9 +269,15 @@ def link_obstable_samplebars(ele, // selected row const row = cb_obj.indices[0]; const taxid = cds_p_obstable.data["index"][row]; + const rank = cds_p_obstable.data["col|rank"][row]; const total = cds_d_samples.data["cnt|total"]; - for(let i = 0; i < cds_p_decontam.data["counts"].length; i++){ - cds_p_decontam.data["counts"][i] = cds_d_sampleobs.data[taxid][i]/total[i]; + for(let i = 0; i < cds_p_decontam.length; i++){ + let sample = cds_p_decontam.data["index"][i]; + if (dict_d_sampleobs[rank][taxid][sample]!=undefined){ + cds_p_decontam.data["counts"][i] = dict_d_sampleobs[rank][taxid][sample]/total[i]; + 
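+                        // Note: dict_d_sampleobs stores only non-zero counts
+                        // ({rank: {obs: {sample: count}}}), so an undefined entry
+                        // means a zero count for this sample (else branch below)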
}else{ + cds_p_decontam.data["counts"][i] = 0; + } } cds_p_decontam.change.emit(); @@ -674,12 +681,12 @@ def link_correlation_widgets(ele, cds_p_correlation): ele["correlation"]["wid"]["rank_select"].js_on_change('value', rank_select_callback) -def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, cds_d_sampleobs, cds_d_samples, top_obs_bars, dict_d_taxname, cds_d_metadata): +def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cds_d_samples, top_obs_bars, dict_d_taxname, cds_d_metadata): rank_select_callback = CustomJS( args=dict(sort_select=ele["obsbars"]["wid"]["sort_select"], legend=ele["obsbars"]["legend"], cds_p_obsbars=cds_p_obsbars, - cds_d_sampleobs=cds_d_sampleobs, + dict_d_sampleobs=dict_d_sampleobs, cds_d_samples=cds_d_samples, dict_d_topobs=dict_d_topobs, dict_d_taxname=dict_d_taxname, @@ -703,9 +710,14 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, cds_d_sampleobs, cds var sum_bars = 0; if (taxid!=undefined){ for(let s = 0; s 0: + dict_sampleobs[rank][obs][sample] = val + + print_df(dict_sampleobs, "dict_sampleobs -> dict_d_sampleobs") + return dict_sampleobs + + def generate_cds_sampleobs(table): # matrix-like cds with raw counts # index -> sample-ids @@ -269,7 +285,6 @@ def generate_cds_sampleobs(table): df_sampleobs = pd.DataFrame(index=table.samples) for rank in table.ranks(): df_sampleobs = pd.concat([df_sampleobs, table.data[rank]], axis=1) - # fill NaN with zero so bars do not "dissapear" when plotting df_sampleobs.fillna(0, inplace=True) print_df(df_sampleobs, "df_sampleobs -> cds_d_sampleobs") diff --git a/grimer/grimer.py b/grimer/grimer.py index 9eb2c3a..19265b3 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -261,7 +261,8 @@ def main(): # _d_ # matrix: index (unique sample-ids), columns (unique observations) -> raw counts - cds_d_sampleobs = generate_cds_sampleobs(table) + #cds_d_sampleobs = generate_cds_sampleobs(table) + dict_d_sampleobs = generate_dict_sampleobs(table) # df: index (unique sample-ids), aux|..., cnt|..., cds_d_samples = generate_cds_samples(table, references, controls, decontam) # matrix: index (unique sample-ids) x columns (metadata fields) -> metadata values @@ -378,7 +379,7 @@ def main(): cds_p_obstable, cds_p_samplebars, cds_d_samples, - cds_d_sampleobs, + dict_d_sampleobs, cds_d_metadata, cds_p_decontam, cds_p_decontam_models, @@ -410,7 +411,7 @@ def main(): link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, - cds_d_sampleobs, + dict_d_sampleobs, cds_d_samples, args.top_obs_bars, dict_d_taxname, From 4b0c8c32fb383c9b6771399c0e2cd7e107587448 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 27 Jan 2022 18:06:37 +0100 Subject: [PATCH 19/50] tooltip obs on obsbars --- grimer/grimer.py | 2 +- grimer/plots.py | 29 +++++++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/grimer/grimer.py b/grimer/grimer.py index 19265b3..95b5a62 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -368,8 +368,8 @@ def main(): # obsbars ele["obsbars"] = {} - ele["obsbars"]["fig"], ele["obsbars"]["legend"] = plot_obsbars(cds_p_obsbars, dict_d_topobs, table.ranks(), args.top_obs_bars, dict_d_taxname) ele["obsbars"]["wid"] = plot_obsbars_widgets(table.ranks(), metadata, dict_d_topobs, dict_d_taxname, args.top_obs_bars) + ele["obsbars"]["fig"], ele["obsbars"]["legend"] = plot_obsbars(cds_p_obsbars, dict_d_topobs, table.ranks(), args.top_obs_bars, dict_d_taxname, ele["obsbars"]["wid"]["rank_select"]) ############ JAVASCRIPT LINKING diff --git a/grimer/plots.py b/grimer/plots.py index 3b8b8fa..4871f3c 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -94,7 +94,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): return samplebars_fig, legend_obs, legend_bars -def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxname): +def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxname, rank_select): obsbars_fig = figure(x_range=FactorRange(factors=cds_p_obsbars.data["factors"]), y_range=Range1d(start=0, end=100), height=450, @@ -102,23 +102,28 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna tools="box_zoom,reset,save") # TODO Need to know which rank to get the correct set of top taxa - # taxid_name_custom = CustomJSHover( - # args=dict(dict_d_taxname=ColumnDataSource(dict(dict_d_taxname=[dict_d_taxname])), - # dict_d_topobs=ColumnDataSource(dict(dict_d_topobs=[dict_d_topobs]))), - # code=''' - # //var taxid = dict_d_topobs.data.dict_d_topobs[0][RANK][value]; - # //console.log(dict_d_topobs); - # //return dict_d_taxname.data.dict_d_taxname[0][taxid]; // value holds the @taxid - # ''') + taxid_name_custom = CustomJSHover( + args=dict(dict_d_taxname=ColumnDataSource(dict(dict_d_taxname=[dict_d_taxname])), + dict_d_topobs=ColumnDataSource(dict(dict_d_topobs=[dict_d_topobs])), + rank_select=rank_select), + code=''' + // value holds the column index + var taxid = dict_d_topobs.data.dict_d_topobs[0][rank_select.value][value]; + if(taxid!=undefined){ + return dict_d_taxname.data.dict_d_taxname[0][taxid]; + }else{ + return value; + } + ''') # Add custom tooltip for heatmap (taxid->name) obsbars_fig.add_tools(HoverTool( tooltips=[("Sample", "@index"), - ("Label", "$name"), - ("Value", "@$name")], + ("Observation", "$name{custom}"), + ("Value", "@$name{0.2f}%")], mode="mouse", point_policy="follow_mouse", - #formatters={"$name": taxid_name_custom} + formatters={"$name": taxid_name_custom} )) bars = [str(i) for i in range(top_obs_bars)] + ["others", "unassigned"] From 15bb9a700e3b93d77e1139c9c9a4c401823b2c15 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 28 Jan 2022 09:50:49 +0100 Subject: [PATCH 20/50] counts from 1 obsbars --- grimer/callbacks.py | 2 +- grimer/cds.py | 23 +++++++---------------- grimer/grimer.py | 3 +-- grimer/plots.py | 5 ++--- 4 files changed, 11 insertions(+), 22 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index d41da2c..5284580 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -719,7 +719,7 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cd // sum counts for sample sum_assigned[s]+=val; // update legend label - legend.items[i].label = i.toString() + ") " + dict_d_taxname[taxid]; + legend.items[i].label = (i+1).toString() + ") " + dict_d_taxname[taxid]; } // not sync with gui // https://github.com/bokeh/bokeh/issues/10211 diff --git a/grimer/cds.py b/grimer/cds.py index 8d9edb9..2b7cbe4 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -11,6 +11,10 @@ def generate_dict_taxname(tax, taxids): + """ + mapping taxids to names + (or names to names if taxid is not used) + """ id_name = {} for i in taxids: n = tax.name(i) if tax else i @@ -37,6 +41,7 @@ def generate_cds_plot_references(table, tax, references): print_df(df_references, "df_references -> cds_p_references") return ColumnDataSource(df_references) + def generate_cds_annotations(table, references, controls, decontam): # Stacked matrix of true annotations (omit false) # index -> taxids @@ -262,7 +267,7 @@ def generate_cds_plot_decontam_models(decontam): def generate_dict_sampleobs(table): - # dict with raw counts + # dict with raw counts (not storing zeros) # dict_sampleobs[rank][obs][sample] = count dict_sampleobs = {} for rank in table.ranks(): @@ -277,20 +282,6 @@ def generate_dict_sampleobs(table): return dict_sampleobs -def generate_cds_sampleobs(table): - # matrix-like cds with raw counts - # index -> sample-ids - # columns -> taxids (from all ranks) - # values are observation raw counts - df_sampleobs = pd.DataFrame(index=table.samples) - for rank in table.ranks(): - df_sampleobs = pd.concat([df_sampleobs, table.data[rank]], axis=1) - # fill NaN with zero so bars do not "dissapear" when plotting - df_sampleobs.fillna(0, inplace=True) - print_df(df_sampleobs, "df_sampleobs -> cds_d_sampleobs") - return ColumnDataSource(df_sampleobs) - - def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): # Stacked matrix of raw counts + transformed value # index -> sample-ids (repeated) @@ -454,7 +445,7 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): def generate_cds_obsbars(table, top_obs_bars): # index (unique sample-ids) - # cols: 0, 1, ..., top_obs_bars, unassigned, others, factors + # cols: 1, 2, ..., top_obs_bars, unassigned, others, factors #Load with data from first rank top_taxids = table.get_top(table.ranks()[0], top_obs_bars) diff --git a/grimer/grimer.py b/grimer/grimer.py index 95b5a62..b52bbae 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -260,8 +260,7 @@ def main(): cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars) # _d_ - # matrix: index (unique sample-ids), columns (unique observations) -> raw counts - #cds_d_sampleobs = generate_cds_sampleobs(table) + # dict: {rank: {obs: {sample: count}}} dict_d_sampleobs = generate_dict_sampleobs(table) # df: index (unique sample-ids), aux|..., cnt|..., cds_d_samples = generate_cds_samples(table, references, controls, decontam) diff --git a/grimer/plots.py b/grimer/plots.py index 4871f3c..d7ab9e2 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -164,7 +164,7 
@@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna legend_bars_items = [] for i in range(top_obs_bars): if i < len(dict_d_topobs[ranks[0]]): - label = str(i) + ") " + dict_d_taxname[dict_d_topobs[ranks[0]][i]] + label = str(i+1) + ") " + dict_d_taxname[dict_d_topobs[ranks[0]][i]] else: label = None legend_bars_items.append((label, [vbar_ren[i]])) @@ -194,8 +194,7 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs sort_options["Default"].append(("col|others", "others")) sort_options["Default"].append(("col|unassigned", "unassigned")) - #sort_options["Observation"] = [("col|" + str(i), dict_d_taxname[t]) for i, t in enumerate(dict_d_topobs[ranks[0]])] - sort_options["Observation"] = [("col|" + str(i), str(i)) for i in range(top_obs_bars)] + sort_options["Observation"] = [("col|" + str(i), str(i+1)) for i in range(top_obs_bars)] sort_options["Numeric Metadata"] = [] if metadata: From c9e16ac7faf2ddf84333f1cb46baabb457b3d6de Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 2 Feb 2022 16:12:46 +0100 Subject: [PATCH 21/50] samples tab with selection working --- grimer/callbacks.py | 92 +++++++++++++++++++++++++-------------------- grimer/cds.py | 22 ++++++++++- grimer/grimer.py | 13 ++++++- grimer/js/func.js | 16 +++++++- grimer/layout.py | 34 ++++++++++++----- grimer/plots.py | 64 +++++++++++++++++++++++++++++++ 6 files changed, 186 insertions(+), 55 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 5284580..882454b 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -681,7 +681,7 @@ def link_correlation_widgets(ele, cds_p_correlation): ele["correlation"]["wid"]["rank_select"].js_on_change('value', rank_select_callback) -def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cds_d_samples, top_obs_bars, dict_d_taxname, cds_d_metadata): +def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cds_d_samples, top_obs_bars, dict_d_taxname, cds_d_metadata, cds_p_sampletable): rank_select_callback = CustomJS( args=dict(sort_select=ele["obsbars"]["wid"]["sort_select"], legend=ele["obsbars"]["legend"], @@ -740,60 +740,71 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cd sort_select=ele["obsbars"]["wid"]["sort_select"], groupby1_select=ele["obsbars"]["wid"]["groupby1_select"], groupby2_select=ele["obsbars"]["wid"]["groupby2_select"], + cds_p_sampletable=cds_p_sampletable, cds_p_obsbars=cds_p_obsbars, cds_d_samples=cds_d_samples, cds_d_metadata=cds_d_metadata), code=''' - // Define value from Sort by select - var sort_col; - var annot_samples = cds_d_samples.data["index"]; - if (sort_select.value=="input_order"){ - sort_col = cds_d_samples.data["aux|input_order"]; - }else if (sort_select.value.startsWith("metadata_num|")){ - sort_col = cds_d_metadata.data[sort_select.value.replace('metadata_num|','')]; - // Annotate label with value - var annot_samples = annot_samples.map(function(s, i) { - return sort_col[i] + " | " + s; - }); - - }else if (sort_select.value.startsWith("col|")){ - sort_col = cds_p_obsbars.data[sort_select.value.replace('col|','')]; - } // Factors can be: index (sort_col|index), [md1, index] or [md1, md2, index] - var factors; - var sorted_factors; - // If group by is selected, use as first sort factor - if(groupby1_select.value!="none"){ - var groupby_col1 = cds_d_metadata.data[groupby1_select.value.replace('metadata_cat|','')]; - - // Zip sample index and metadata field to create nested factors - 
factors = groupby_col1.map(function(m, i) { - return [m, annot_samples[i]]; - }); + var factors = []; + var sorted_factors = []; - // second grouping level - if(groupby2_select.value!="none" && groupby2_select.value!=groupby1_select.value){ + // get index of selected indices + var selected_indices = cds_p_sampletable.selected.indices; + + if(selected_indices.length){ + // samples + var annot_samples = cds_d_samples.data["index"]; + + // Define value from Sort by select + var sort_col; + if (sort_select.value=="input_order"){ + sort_col = cds_d_samples.data["aux|input_order"]; + }else if (sort_select.value.startsWith("metadata_num|")){ + sort_col = cds_d_metadata.data[sort_select.value.replace('metadata_num|','')]; + // Annotate label with value + var annot_samples = annot_samples.map(function(s, i) { + return sort_col[i] + " | " + s; + }); + }else if (sort_select.value.startsWith("col|")){ + sort_col = cds_p_obsbars.data[sort_select.value.replace('col|','')]; + } - var groupby_col2 = cds_d_metadata.data[groupby2_select.value.replace('metadata_cat|','')]; + // If group by is selected, use as first sort factor + if(groupby1_select.value!="none"){ + var groupby_col1 = cds_d_metadata.data[groupby1_select.value.replace('metadata_cat|','')]; - factors = groupby_col2.map(function(m, i) { - return [m, groupby_col1[i], annot_samples[i]]; + // Zip sample index and metadata field to create nested factors + factors = groupby_col1.map(function(m, i) { + return [m, annot_samples[i]]; }); - sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, groupby_col2); + // second grouping level + if(groupby2_select.value!="none" && groupby2_select.value!=groupby1_select.value){ + + var groupby_col2 = cds_d_metadata.data[groupby2_select.value.replace('metadata_cat|','')]; + + factors = groupby_col2.map(function(m, i) { + return [m, groupby_col1[i], annot_samples[i]]; + }); + + // only selected_indices + sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, groupby_col2, selected_indices); + }else{ + sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, [], selected_indices); + } + }else{ - sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1); + // Single factors, just use the sample index + factors = annot_samples; + sorted_factors = grimer_sort(factors, sort_col, "numeric", false, [], [], selected_indices); } - }else{ - // Single factors, just use the sample index - factors = annot_samples; - sorted_factors = grimer_sort(factors, sort_col, "numeric", false); - } + // Change value of the factors on the obsbars cds + cds_p_obsbars.data["factors"] = factors; - // Change value of the factors on the obsbars cds - cds_p_obsbars.data["factors"] = factors; + } // Plot sorted factors obsbars.x_range.factors = sorted_factors; @@ -811,6 +822,7 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cd } ''') + cds_p_sampletable.selected.js_on_change('indices', sort_groupby_callback) ele["obsbars"]["wid"]["toggle_label"].js_on_click(toggle_label_callback) ele["obsbars"]["wid"]["groupby1_select"].js_on_change('value', sort_groupby_callback) ele["obsbars"]["wid"]["groupby2_select"].js_on_change('value', sort_groupby_callback) diff --git a/grimer/cds.py b/grimer/cds.py index 2b7cbe4..7007cfc 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -134,7 +134,27 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec return ColumnDataSource(df_obstable) -def 
generate_cds_bars(table): +def generate_cds_sampletable(table): + # index unique sample-ids + # col|... values to plot to columns in the datatable + + df_sampletable = pd.DataFrame(index=table.samples) + df_sampletable["col|total"] = table.total + assigned = table.total - table.unassigned + df_sampletable["col|assigned"] = assigned + df_sampletable["col|assigned_perc"] = assigned.divide(table.total, axis=0) + df_sampletable["col|unassigned"] = table.unassigned + df_sampletable["col|unassigned_perc"] = table.unassigned.divide(table.total, axis=0) + + # assigned by rank + for rank in table.ranks(): + df_sampletable["col|" + rank] = table.data[rank].sum(axis=1).divide(table.total, axis=0) + + print_df(df_sampletable, "df_sampletable -> cds_p_sampletable") + return ColumnDataSource(df_sampletable) + + +def generate_cds_samplebars(table): # index unique sample-ids # aux| auxiliary values (not plotted) # bar| values plotted as bars (sample counts) diff --git a/grimer/grimer.py b/grimer/grimer.py index b52bbae..e404b03 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -237,7 +237,7 @@ def main(): # this cds an exeption and contains data to plot (col|) and auxiliary data (tax|) cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam) # df: index (unique sample-ids), aux|..., bar|..., tax|... - cds_p_samplebars = generate_cds_bars(table) + cds_p_samplebars = generate_cds_samplebars(table) # stacked: index (repeated observations), rank, ref, direct, parent cds_p_references = generate_cds_plot_references(table, tax, references) # matrix: index (unique sample-ids), concentrations, controls, counts @@ -258,6 +258,8 @@ def main(): cds_p_correlation = generate_cds_correlation(table, args.top_obs_corr, replace_zero_value) # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars) + # df: index (unique sample-ids), col|..., tax|..., aux|ref + cds_p_sampletable = generate_cds_sampletable(table) # _d_ # dict: {rank: {obs: {sample: count}}} @@ -332,6 +334,11 @@ def main(): ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, max_total_count, table.ranks()) ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, list(references.keys()), list(controls.keys()), decontam) + # sampletable + ele["sampletable"] = {} + ele["sampletable"]["fig"] = plot_sampletable(cds_p_sampletable, sizes, table.ranks()) + ele["sampletable"]["wid"] = plot_sampletable_widgets(sizes, max(cds_p_sampletable.data["col|total"]), metadata) + # heatmap tools_heatmap = "hover,save,box_zoom,reset,crosshair,box_select" ele["heatmap"] = {} @@ -390,6 +397,7 @@ def main(): cds_p_mgnify, dict_d_refs) + link_heatmap_widgets(ele, cds_d_samples, cds_d_metadata, @@ -414,7 +422,8 @@ def main(): cds_d_samples, args.top_obs_bars, dict_d_taxname, - cds_d_metadata) + cds_d_metadata, + cds_p_sampletable) ############ LAYOUT diff --git a/grimer/js/func.js b/grimer/js/func.js index ee1c52d..bab3693 100644 --- a/grimer/js/func.js +++ b/grimer/js/func.js @@ -1,8 +1,20 @@ function sort_numeric(a, b){ return a - b; } function sort_string(a, b){ return a.localeCompare(b); } -function grimer_sort(factors, sort_col, sort_mode="numeric", desc=false, group_col1=[], group_col2=[]) { - //mode : numeric, string +function grimer_sort(factors, sort_col, sort_mode="numeric", desc=false, group_col1=[], group_col2=[], index=[]) { + //sort_mode 
: numeric, string + + // subset data if index provided + if(index.length){ + factors = index.map( s => factors[s] ); + sort_col = index.map( s => sort_col[s] ); + if(group_col1.length){ + group_col1 = index.map( s => group_col1[s] ); + } + if(group_col2.length){ + group_col2 = index.map( s => group_col2[s] ); + } + } // Generate numerical index to sort arrays var idx = new Array(factors.length); diff --git a/grimer/layout.py b/grimer/layout.py index 11c93f1..75aefcb 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -58,6 +58,28 @@ def make_layout(ele, sizes, version, logo_path, title): ele["samplebars"]["wid"]["help_button"]), ele["samplebars"]["wid"]["toggle_label"]) + selectwidgets = column(ele["sampletable"]["wid"]["total_counts_spinner"], + ele["sampletable"]["wid"]["assigned_spinner"], + ele["sampletable"]["wid"]["metadata_multichoice"], + ele["sampletable"]["wid"]["help_button"]) + + selectwidgetstabs = Tabs(tabs=[Panel(child=selectwidgets, title="Select")], + sizing_mode="fixed", + height=sizes["overview_top_panel_height"] + 20, + width=sizes["overview_top_panel_width_left"]) + + row_sampletable = row(selectwidgetstabs, + ele["sampletable"]["fig"], + sizing_mode="stretch_width") + + row_obsbars = column(row(ele["obsbars"]["fig"]), + row(ele["obsbars"]["wid"]["rank_select"], + ele["obsbars"]["wid"]["groupby1_select"], + ele["obsbars"]["wid"]["groupby2_select"], + ele["obsbars"]["wid"]["sort_select"], + ele["obsbars"]["wid"]["help_button"]), + ele["obsbars"]["wid"]["toggle_label"]) + row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]], [ele["dendrox"]["fig"]], [ele["annotations"]["fig"], ele["heatmap"]["wid"]["help_button"]]], @@ -88,19 +110,11 @@ def make_layout(ele, sizes, version, logo_path, title): ele["correlation"]["wid"]["help_button"]), ele["correlation"]["fig"]) - row_obsbars = column(row(ele["obsbars"]["fig"]), - row(ele["obsbars"]["wid"]["rank_select"], - ele["obsbars"]["wid"]["groupby1_select"], - ele["obsbars"]["wid"]["groupby2_select"], - ele["obsbars"]["wid"]["sort_select"], - ele["obsbars"]["wid"]["help_button"]), - ele["obsbars"]["wid"]["toggle_label"]) - main_panels = [] main_panels.append(Panel(child=column(row_obstable, row_barpot, sizing_mode="stretch_width"), title="Overview")) + main_panels.append(Panel(child=column(row_sampletable, row_obsbars, sizing_mode="stretch_width"), title="Samples")) main_panels.append(Panel(child=column(row_heatmap, row_heatmap_widgets, sizing_mode="stretch_width"), title="Heatmap")) main_panels.append(Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation")) - main_panels.append(Panel(child=column(row_obsbars, sizing_mode="stretch_width"), title="Bars")) main_tab = Tabs(tabs=main_panels) logo_base64 = base64.b64encode(open(logo_path, 'rb').read()) # encode to base64 @@ -112,4 +126,4 @@ def make_layout(ele, sizes, version, logo_path, title): title_div = Spacer() final = column([row(logo_div, title_div), main_tab], sizing_mode="stretch_width") - return final \ No newline at end of file + return final diff --git a/grimer/plots.py b/grimer/plots.py index d7ab9e2..9a93feb 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -372,6 +372,70 @@ def plot_obstable_widgets(sizes, dict_d_taxname, max_count_rank): "help_button": help_button(title="Observation table", text=help_text, align="start")} + + +def plot_sampletable(cds_p_sampletable, sizes, ranks): + + table_cols = [] + table_cols.append(TableColumn(field="index", title="Sample")) + 
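+    # "_perc" and per-rank columns hold fractions (0-1) built in generate_cds_sampletable;
+    # NumberFormatter(format="0.00%") renders e.g. 0.8765 as "87.65%"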
table_cols.append(TableColumn(field="col|total", title="Total counts", default_sort="descending")) + table_cols.append(TableColumn(field="col|assigned", title="Assigned")) + table_cols.append(TableColumn(field="col|assigned_perc", title="Assigned %", default_sort="descending", formatter=NumberFormatter(format="0.00%"))) + table_cols.append(TableColumn(field="col|unassigned", title="Unassigned")) + table_cols.append(TableColumn(field="col|unassigned_perc", title="Unassigned %", formatter=NumberFormatter(format="0.00%"))) + + # Pre-select all checkboxes + cds_p_sampletable.selected.indices = list(range(len(cds_p_sampletable.data["index"]))) + + for rank in ranks: + table_cols.append(TableColumn(field="col|" + rank, title=rank, formatter=NumberFormatter(format="0.00%"))) + + sampletable = DataTable(height=sizes["overview_top_panel_height"], + sizing_mode="stretch_width", + index_position=None, + autosize_mode="fit_viewport", + selectable="checkbox", + frozen_columns=1, + columns=table_cols, + source=cds_p_sampletable) + + return sampletable + + +def plot_sampletable_widgets(sizes, max_count_samples, metadata): + # Filtering options + spinner_width = sizes["overview_top_panel_width_left"] - 20 + + total_counts_spinner = Spinner(title="Total counts", low=1, high=max_count_samples, step=1, value=1, width=spinner_width, height=50) + assigned_spinner = Spinner(title="Assigned", low=0, high=100, value=0, step=0.1, width=spinner_width, height=50) + + if metadata: + metadata_values = [] + for field in metadata.get_data().columns.to_list(): + for value in metadata.get_unique_values(field): + metadata_values.append((field + "|" + str(value), field + " = " + str(value))) + + metadata_multichoice = MultiChoice(title="Metadata", + options=metadata_values, + sizing_mode="fixed", + width=sizes["overview_top_panel_width_left"] - 20, height=60) + else: + metadata_multichoice = None + + help_text = """ +helppp + +samples + +aaa +""" + + return {"total_counts_spinner": total_counts_spinner, + "assigned_spinner": assigned_spinner, + "metadata_multichoice": metadata_multichoice, + "help_button": help_button(title="Sample selection", text=help_text, align="start")} + + def plot_infopanel(): return TextAreaInput(value="Click on the table items to load more information", sizing_mode="stretch_both", From e673a9985a4ba943944571e83132b209cf0cbae5 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Wed, 2 Feb 2022 19:12:21 +0100 Subject: [PATCH 22/50] select filtering working --- grimer/callbacks.py | 40 ++++++++++++++++++++++++++++++++++++++++ grimer/cds.py | 2 ++ grimer/grimer.py | 5 +++-- grimer/plots.py | 4 ++-- 4 files changed, 47 insertions(+), 4 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 882454b..90d2843 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -828,3 +828,43 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cd ele["obsbars"]["wid"]["groupby2_select"].js_on_change('value', sort_groupby_callback) ele["obsbars"]["wid"]["sort_select"].js_on_change('value', sort_groupby_callback) ele["obsbars"]["wid"]["rank_select"].js_on_change('value', rank_select_callback, sort_groupby_callback) + + +def link_sampletable_select(ele, cds_p_sampletable, cds_d_metadata): + + select_callback = CustomJS( + args=dict(cds_p_sampletable=cds_p_sampletable, + cds_d_metadata=cds_d_metadata, + total_counts_spinner=ele["sampletable"]["wid"]["total_counts_spinner"], + assigned_spinner=ele["sampletable"]["wid"]["assigned_spinner"], + metadata_multichoice=ele["sampletable"]["wid"]["metadata_multichoice"], + ), + code=''' + var selected_indices = []; + for (var i = 0; i < cds_p_sampletable.length; i++) { + if (cds_p_sampletable.data['col|total'][i] < total_counts_spinner.value){ + continue; + } + if (cds_p_sampletable.data['col|assigned_perc'][i] < (assigned_spinner.value/100)){ + continue; + } + if (metadata_multichoice.value.length > 0 ){ + var found = false; + for (var m=0; m < metadata_multichoice.value.length; ++m){ + const md = metadata_multichoice.value[m].split("|"); + if(cds_d_metadata.data[md[0]][i]==md[1]){ + found = true; + break; + } + } + if (!found) { + continue; + } + } + selected_indices.push(i); + } + cds_p_sampletable.selected.indices = selected_indices; + ''') + ele["sampletable"]["wid"]["total_counts_spinner"].js_on_change('value', select_callback) + ele["sampletable"]["wid"]["assigned_spinner"].js_on_change('value', select_callback) + ele["sampletable"]["wid"]["metadata_multichoice"].js_on_change('value', select_callback) diff --git a/grimer/cds.py b/grimer/cds.py index 7007cfc..c7aed83 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -150,6 +150,8 @@ def generate_cds_sampletable(table): for rank in table.ranks(): df_sampletable["col|" + rank] = table.data[rank].sum(axis=1).divide(table.total, axis=0) + df_sampletable.fillna(0, inplace=True) + print_df(df_sampletable, "df_sampletable -> cds_p_sampletable") return ColumnDataSource(df_sampletable) diff --git a/grimer/grimer.py b/grimer/grimer.py index e404b03..7a4eed0 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -258,7 +258,7 @@ def main(): cds_p_correlation = generate_cds_correlation(table, args.top_obs_corr, replace_zero_value) # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars) - # df: index (unique sample-ids), col|..., tax|..., aux|ref + # df: index (unique sample-ids), col|... 
cds_p_sampletable = generate_cds_sampletable(table) # _d_ @@ -397,7 +397,6 @@ def main(): cds_p_mgnify, dict_d_refs) - link_heatmap_widgets(ele, cds_d_samples, cds_d_metadata, @@ -425,6 +424,8 @@ def main(): cds_d_metadata, cds_p_sampletable) + link_sampletable_select(ele, cds_p_sampletable, cds_d_metadata) + ############ LAYOUT # Define path of running script to get static files diff --git a/grimer/plots.py b/grimer/plots.py index 9a93feb..1132a95 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -407,7 +407,7 @@ def plot_sampletable_widgets(sizes, max_count_samples, metadata): spinner_width = sizes["overview_top_panel_width_left"] - 20 total_counts_spinner = Spinner(title="Total counts", low=1, high=max_count_samples, step=1, value=1, width=spinner_width, height=50) - assigned_spinner = Spinner(title="Assigned", low=0, high=100, value=0, step=0.1, width=spinner_width, height=50) + assigned_spinner = Spinner(title="Assigned %", low=0, high=100, value=0, step=1, width=spinner_width, height=50) if metadata: metadata_values = [] @@ -415,7 +415,7 @@ def plot_sampletable_widgets(sizes, max_count_samples, metadata): for value in metadata.get_unique_values(field): metadata_values.append((field + "|" + str(value), field + " = " + str(value))) - metadata_multichoice = MultiChoice(title="Metadata", + metadata_multichoice = MultiChoice(title="Metadata (union)", options=metadata_values, sizing_mode="fixed", width=sizes["overview_top_panel_width_left"] - 20, height=60) From 23edc62cdb0f0e3487f8b008bfaa4b823e1ac2e1 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 3 Feb 2022 14:21:15 +0100 Subject: [PATCH 23/50] help plots samples --- grimer/grimer.py | 5 ++++- grimer/plots.py | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/grimer/grimer.py b/grimer/grimer.py index 7a4eed0..b9b83b0 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -356,7 +356,10 @@ def main(): # annotations ele["annotations"] = {} - ele["annotations"]["fig"] = plot_annotations(ele["heatmap"]["fig"], tools_heatmap, cds_p_annotations, dict_d_taxname) + if cds_p_annotations.data["index"].size: + ele["annotations"]["fig"] = plot_annotations(ele["heatmap"]["fig"], tools_heatmap, cds_p_annotations, dict_d_taxname) + else: + ele["annotations"]["fig"] = Spacer() # dendrograms ele["dendrox"] = {} diff --git a/grimer/plots.py b/grimer/plots.py index 1132a95..aa1c820 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -415,7 +415,7 @@ def plot_sampletable_widgets(sizes, max_count_samples, metadata): for value in metadata.get_unique_values(field): metadata_values.append((field + "|" + str(value), field + " = " + str(value))) - metadata_multichoice = MultiChoice(title="Metadata (union)", + metadata_multichoice = MultiChoice(title="Metadata", options=metadata_values, sizing_mode="fixed", width=sizes["overview_top_panel_width_left"] - 20, height=60) @@ -423,11 +423,11 @@ def plot_sampletable_widgets(sizes, max_count_samples, metadata): metadata_multichoice = None help_text = """ -helppp +Summary of samples. Entries selected in the table are shown in the barplot below. -samples +Widgets can select batches of entries in the table by multiple criteria. -aaa +Multiple metadata fields/values can be chosen and the union of the matching results will be selected in the table. 
""" return {"total_counts_spinner": total_counts_spinner, @@ -658,7 +658,7 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax # Need to pass dict_d_taxname inside a one column data taxid_name_custom = CustomJSHover( args=dict(dict_d_taxname=ColumnDataSource(dict(dict_d_taxname=[dict_d_taxname]))), - code="return dict_d_taxname.data.dict_d_taxname[0][value]; // value holds the @taxid" + code="console.log(special_vars); return dict_d_taxname.data.dict_d_taxname[0][value]; // value holds the @taxid" ) # Add custom tooltip for heatmap (taxid->name) heatmap.add_tools(HoverTool( From cc0542d001cf5ff440aa00e6bb16ce68aa63ecce Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 3 Feb 2022 18:05:32 +0100 Subject: [PATCH 24/50] bug fix correlation --- grimer/callbacks.py | 11 +++++++---- grimer/cds.py | 47 +++++++++++++++++++++++---------------------- grimer/grimer.py | 2 +- grimer/metadata.py | 3 +++ grimer/plots.py | 12 ++++++------ grimer/utils.py | 13 ++++--------- 6 files changed, 45 insertions(+), 43 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 90d2843..7329198 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -647,14 +647,16 @@ def link_correlation_widgets(ele, cds_p_correlation): args=dict(correlation=ele["correlation"]["fig"], cds_p_correlation=cds_p_correlation), code=''' - const factors = new Set(); + var factors = new Set(); for(let i = 0; i < cds_p_correlation.data["index"].length; i++){ if(cds_p_correlation.data["rank"][i]==this.value){ factors.add(cds_p_correlation.data["index"][i]); + factors.add(cds_p_correlation.data["taxid"][i]); } } - correlation.x_range.factors = [...factors]; - correlation.y_range.factors = [...factors].reverse(); + factors = [...factors].sort(); + correlation.x_range.factors = factors; + correlation.y_range.factors = factors.reverse(); ''') filter_callback = CustomJS( @@ -867,4 +869,5 @@ def link_sampletable_select(ele, cds_p_sampletable, cds_d_metadata): ''') ele["sampletable"]["wid"]["total_counts_spinner"].js_on_change('value', select_callback) ele["sampletable"]["wid"]["assigned_spinner"].js_on_change('value', select_callback) - ele["sampletable"]["wid"]["metadata_multichoice"].js_on_change('value', select_callback) + if cds_d_metadata: + ele["sampletable"]["wid"]["metadata_multichoice"].js_on_change('value', select_callback) diff --git a/grimer/cds.py b/grimer/cds.py index c7aed83..499ef7d 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -38,7 +38,7 @@ def generate_cds_plot_references(table, tax, references): df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "parent"]) df_references.set_index('obs', inplace=True) - print_df(df_references, "df_references -> cds_p_references") + print_df(df_references, "cds_p_references") return ColumnDataSource(df_references) @@ -73,7 +73,7 @@ def generate_cds_annotations(table, references, controls, decontam): # Concat in the main df df_annotations = pd.concat([df_annotations, df_rank], axis=0) - print_df(df_annotations, "df_annotations -> cds_p_annotations") + print_df(df_annotations, "cds_p_annotations") return ColumnDataSource(df_annotations) @@ -130,7 +130,7 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec # Concat in the main df df_obstable = pd.concat([df_obstable, df_rank], axis=0) - print_df(df_obstable, "df_obstable -> cds_p_obstable") + print_df(df_obstable, "cds_p_obstable") return ColumnDataSource(df_obstable) @@ -152,7 +152,7 @@ def generate_cds_sampletable(table): 
df_sampletable.fillna(0, inplace=True) - print_df(df_sampletable, "df_sampletable -> cds_p_sampletable") + print_df(df_sampletable, "cds_p_sampletable") return ColumnDataSource(df_sampletable) @@ -174,7 +174,7 @@ def generate_cds_samplebars(table): for rank in table.ranks(): df_bars["tax|" + rank] = None - print_df(df_bars, "df_bars -> cds_p_samplebars") + print_df(df_bars, "cds_p_samplebars") return ColumnDataSource(df_bars) @@ -216,7 +216,7 @@ def generate_cds_samples(table, references, controls, decontam): # fill NaN with zero so bars do not "dissapear" when plotting df_samples.fillna(0, inplace=True) - print_df(df_samples, "df_samples -> cds_d_samples") + print_df(df_samples, "cds_d_samples") return ColumnDataSource(df_samples) @@ -225,7 +225,7 @@ def generate_cds_metadata(metadata): # columns -> metadata fields # values -> metadata values df_md = metadata.get_data() - print_df(df_md, "df_md -> cds_d_metadata") + print_df(df_md, "cds_d_metadata") return ColumnDataSource(df_md) @@ -242,7 +242,7 @@ def generate_cds_plot_metadata(metadata, max_metadata_cols): #df_plot_md[str(i+1)] = [(metadata_fields[i], '{:.16g}'.format(md_value) if not isinstance(md_value, str) else md_value) for md_value in metadata.get_col(metadata_fields[i])] df_plot_md[str(i + 1)] = [(metadata_fields[i], format_js_toString(md_value)) for md_value in metadata.get_col(metadata_fields[i])] - print_df(df_plot_md, "df_plot_md -> cds_p_metadata") + print_df(df_plot_md, "cds_p_metadata") return ColumnDataSource(df_plot_md) @@ -254,7 +254,7 @@ def generate_cds_plot_decontam(decontam): df_decontam = decontam.get_data() df_decontam["controls"] = df_decontam["controls"].map({True: 'Control', False: 'Sample'}) df_decontam["counts"] = None - print_df(df_decontam, "df_decontam -> cds_p_decontam") + print_df(df_decontam, "cds_p_decontam") return ColumnDataSource(df_decontam) @@ -270,7 +270,7 @@ def generate_cds_decontam(decontam, ranks): vals = list(zip(df_valid_vals["contam"], df_valid_vals["contam_2"], df_valid_vals["non.contam"], pval)) dict_coord_mod.update(dict(zip(df_valid_vals.index, vals))) - print_df(dict_coord_mod, "dict_coord_mod -> cds_d_decontam_models") + print_df(dict_coord_mod, "cds_d_decontam_models") return ColumnDataSource(dict_coord_mod) @@ -284,7 +284,7 @@ def generate_cds_plot_decontam_models(decontam): decontam.get_data()["concentration"].max()] dict_decontam_models["y_cont"] = [None, None] dict_decontam_models["y_noncont"] = [None, None] - print_df(dict_decontam_models, "dict_decontam_models -> cds_p_decontam_models") + print_df(dict_decontam_models, "cds_p_decontam_models") return ColumnDataSource(dict_decontam_models) @@ -300,7 +300,7 @@ def generate_dict_sampleobs(table): if val > 0: dict_sampleobs[rank][obs][sample] = val - print_df(dict_sampleobs, "dict_sampleobs -> dict_d_sampleobs") + print_df(dict_sampleobs, "dict_d_sampleobs") return dict_sampleobs @@ -326,7 +326,8 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0) - print_df(df_heatmap, "df_heatmap -> cds_p_heatmap") + df_heatmap.drop('ov', axis=1, inplace=True) + print_df(df_heatmap, "cds_p_heatmap") return ColumnDataSource(df_heatmap) @@ -350,8 +351,8 @@ def generate_dict_hcluster(table, hcluster): # taxa leaves_x[key] = hcluster[rank][method][metric]["x"]["index"] - print_df(leaves_x, "leaves_x -> dict_d_hcluster_x") - print_df(leaves_y, "leaves_y -> dict_d_hcluster_y") + print_df(leaves_x, "dict_d_hcluster_x") + print_df(leaves_y, 
"dict_d_hcluster_y") return leaves_x, leaves_y @@ -359,8 +360,8 @@ def generate_cds_plot_dendro(): # Empty CDS {"x": [], "y": [], "c": []} dendro_x = {"x": [], "y": [], "c": []} dendro_y = {"x": [], "y": [], "c": []} - print_df(dendro_x, "dendro_x -> cds_p_dendro_x") - print_df(dendro_y, "dendro_y -> cds_p_dendro_y") + print_df(dendro_x, "cds_p_dendro_x") + print_df(dendro_y, "cds_p_dendro_y") return ColumnDataSource(dendro_x), ColumnDataSource(dendro_y) @@ -391,7 +392,7 @@ def generate_dict_topobs(table, top_obs_bars): dict_top_taxa = {} for rank in table.ranks(): dict_top_taxa[rank] = table.get_top(rank, top_obs_bars) - print_df(dict_top_taxa, "dict_top_taxa -> dict_d_topobs") + print_df(dict_top_taxa, "dict_d_topobs") return dict_top_taxa @@ -418,7 +419,7 @@ def generate_dict_refs(table, references): d_refs[i][sname][desc] = [] d_refs[i][sname][desc].append(ref) - print_df(d_refs, "d_refs -> dict_d_refs") + print_df(d_refs, "dict_d_refs") return d_refs @@ -445,7 +446,7 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): if len(matrix.columns) == 2: # If there are only 2 observations, return in a float # re-format in a matrix shape - rho = np.array([[np.nan, np.nan], [rho, np.nan]]) + rho = np.array([[np.nan, np.nan], [rho[1, 0], np.nan]]) else: # fill upper triangular matrix (mirrored values) with nan to be ignored by pandas # to save half of the space @@ -461,7 +462,7 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) - print_df(df_corr, "df_corr -> cds_p_correlation") + print_df(df_corr, "cds_p_correlation") return ColumnDataSource(df_corr) @@ -484,7 +485,7 @@ def generate_cds_obsbars(table, top_obs_bars): df_obsbars = transform_table(df_obsbars, table.total, "norm", 0) * 100 df_obsbars["factors"] = df_obsbars.index.to_list() - print_df(df_obsbars, "df_obsbars -> cds_p_obsbars") + print_df(df_obsbars, "cds_p_obsbars") return ColumnDataSource(df_obsbars) @@ -535,5 +536,5 @@ def generate_cds_mgnify(mgnify, table, tax): # set index df_mgnify.set_index('taxa', inplace=True) - print_df(df_mgnify, "df_mgnify -> cds_p_mgnify") + print_df(df_mgnify, "cds_p_mgnify") return ColumnDataSource(df_mgnify) diff --git a/grimer/grimer.py b/grimer/grimer.py index b9b83b0..46d053a 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -67,7 +67,7 @@ def main(): heatmap_group.add_argument('--optimal-ordering', default=False, action='store_true', help="Activate optimal_ordering on linkage, takes longer for large number of samples.") heatmap_group.add_argument('--show-zeros', default=False, action='store_true', help="Do not skip zeros on heatmap. File will be bigger and iteraction with heatmap slower.") heatmap_group.add_argument('--linkage-methods', type=str, nargs="*", default=["complete"], choices=list(_LINKAGE_METHODS)) - heatmap_group.add_argument('--linkage-metrics', type=str, nargs="*", default=["euclidean", "braycurtis"], choices=_METRICS_NAMES) + heatmap_group.add_argument('--linkage-metrics', type=str, nargs="*", default=["euclidean"], choices=_METRICS_NAMES) heatmap_group.add_argument('--skip-dendrogram', default=False, action='store_true', help="Disable dendogram. 
Will create smaller files.") correlation_group = parser.add_argument_group('Correlation options') diff --git a/grimer/metadata.py b/grimer/metadata.py index 8eabb4c..4160879 100644 --- a/grimer/metadata.py +++ b/grimer/metadata.py @@ -39,6 +39,9 @@ def __init__(self, metadata_file, samples: list=[]): # Convert NaN on categorical to "" self.data[self.types[self.types == "categorical"].index] = self.data[self.types[self.types == "categorical"].index].fillna('') + # Convert boolean to String + mask = self.data[self.types[self.types == "categorical"].index].applymap(type) != bool + self.data[self.types[self.types == "categorical"].index] = self.data[self.types[self.types == "categorical"].index].where(mask, self.data[self.types[self.types == "categorical"].index].replace({True: 'True', False: 'False'})) # Remove names self.data.index.names = [None] diff --git a/grimer/plots.py b/grimer/plots.py index aa1c820..c248525 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -1,7 +1,7 @@ import markdown # Bokeh -from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FuncTickFormatter, HoverTool, Legend, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, Range1d, RangeSlider, Select, Spinner, Tabs, TextAreaInput, TextInput +from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FuncTickFormatter, HoverTool, Legend, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput from bokeh.models.filters import IndexFilter, GroupFilter from bokeh.models.widgets import DataTable, TableColumn from bokeh.palettes import Blues, Dark2, Magma256, Reds @@ -420,7 +420,7 @@ def plot_sampletable_widgets(sizes, max_count_samples, metadata): sizing_mode="fixed", width=sizes["overview_top_panel_width_left"] - 20, height=60) else: - metadata_multichoice = None + metadata_multichoice = Spacer() help_text = """ Summary of samples. Entries selected in the table are shown in the barplot below. 
@@ -658,14 +658,13 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax # Need to pass dict_d_taxname inside a one column data taxid_name_custom = CustomJSHover( args=dict(dict_d_taxname=ColumnDataSource(dict(dict_d_taxname=[dict_d_taxname]))), - code="console.log(special_vars); return dict_d_taxname.data.dict_d_taxname[0][value]; // value holds the @taxid" + code="return dict_d_taxname.data.dict_d_taxname[0][value]; // value holds the @taxid" ) # Add custom tooltip for heatmap (taxid->name) heatmap.add_tools(HoverTool( tooltips=[ ('Sample', '@index'), ('Observation', '@obs{custom}'), - ('Original value', '@ov'), ('Transformed value (' + transformation + ')', '@tv') ], formatters={"@obs": taxid_name_custom} @@ -764,7 +763,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name def plot_dendrogram(heatmap, tools_heatmap, cds_p_dendro_x, cds_p_dendro_y): - + dendrox_fig = figure(x_range=heatmap.x_range, tools="save", height=80, @@ -824,7 +823,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada # (metadata header, str(metadata value)) -> color # Add them to a CategoricalColorMapper which will be applied for the whole plot # Use different palettes for numeric types, but convert to string to be treated as a category - # Need to be careful with int and float, since the value of str(0.0) + # Need to be careful with int and float, since the value of str(0.0) # will not match the 0 in the javascript data conversion, therefore use the numeric to calculate palette # but make get_formatted_unique_values on the dictionary factors = [] @@ -918,6 +917,7 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): for i, rank in enumerate(cds_p_correlation.data["rank"]): if rank == ranks[0]: taxids.add(cds_p_correlation.data["index"][i]) + taxids.add(cds_p_correlation.data["taxid"][i]) taxids = sorted(taxids) corr_fig = figure(x_range=taxids, diff --git a/grimer/utils.py b/grimer/utils.py index 7e26c81..dd6b012 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -619,21 +619,16 @@ def print_log(text): def print_df(df, name: str=None): from grimer.grimer import _debug if _debug: - print("-----------------------------------------------") print(name) if isinstance(df, dict): if df: - print(list(df.keys())[0]) - print("...") - print(list(df.keys())[-1]) - print(list(df.values())[0]) - print("...") - print(list(df.values())[-1]) - print(len(df.keys())) + print(len(df.keys()), "keys:", list(df.keys())[0], "...", list(df.keys())[-1]) + # print(list(df.values())[0], "...", list(df.values())[-1]) else: - print(df.columns) + #print(df.columns) print(df.head()) print(df.shape) + print("size:", sys.getsizeof(df)) print("-----------------------------------------------") From e51e7a7467a95398c2190c50e6f3ed818b897579 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 4 Feb 2022 11:56:41 +0100 Subject: [PATCH 25/50] merge methods metrics on select, better info --- grimer/callbacks.py | 73 ++++++++++++++++++++++++++++----------------- grimer/grimer.py | 3 +- grimer/layout.py | 13 ++------ grimer/plots.py | 15 +++++----- 4 files changed, 57 insertions(+), 47 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 7329198..5f8527b 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -15,7 +15,8 @@ def link_obstable_samplebars(ele, min_obs_perc, max_total_count, cds_p_mgnify, - dict_d_refs): + dict_d_refs, + dict_d_taxname): bar_select_callback = CustomJS( args=dict(y1_select=ele["samplebars"]["wid"]["y1_select"], @@ -223,7 +224,9 @@ def link_obstable_samplebars(ele, load_infopanel = CustomJS( args=dict(infopanel=ele["infopanel"]["textarea"], cds_p_obstable=cds_p_obstable, - dict_d_refs=dict_d_refs), + dict_d_refs=dict_d_refs, + dict_d_taxname=dict_d_taxname, + active_ranks=active_ranks), code=''' // selected row var row = cb_obj.indices[0]; @@ -232,14 +235,41 @@ def link_obstable_samplebars(ele, const taxid = cds_p_obstable.data['index'][row]; var text = ""; - text+="Name: " + name; + text+="[ Obs ]"; + text+="\\n"; + text+=name; + if(taxid!=name){ + text+="\\n"; + text+="taxid: " + taxid; + } text+="\\n"; - text+="Id: " + taxid; + text+="[ Rank ]"; text+="\\n"; - text+="Rank: " + rank; + text+=rank; text+="\\n"; - text+="\\n" + + var lineage = ""; + + for(let r = 0; r < active_ranks.length; r++){ + var obs_lin = cds_p_obstable.data["tax|" + active_ranks[r]][row]; + if(taxid!=name){ + if(dict_d_taxname[obs_lin]) + lineage+=dict_d_taxname[obs_lin]+" | "; + else + lineage+=" | "; + }else{ + lineage+=obs_lin+" | "; + } + if(active_ranks[r]==rank) + break; + } + text+="[ Lineage ]"; + text+="\\n"; + text+=lineage.slice(0, -3); + text+="\\n"; + + text+="\\n" for (var source in dict_d_refs[taxid]){ text+="[ "+source+" ]"; text+="\\n"; @@ -396,28 +426,22 @@ def link_heatmap_widgets(ele, x_dendro_callback = CustomJS( args=dict(rank_select=ele["heatmap"]["wid"]["rank_select"], x_sort_select=ele["heatmap"]["wid"]["x_sort_select"], - x_method_select=ele["heatmap"]["wid"]["x_method_select"], cds_p_dendro_x=cds_p_dendro_x, dict_d_dedro_x=dict_d_dedro_x), code=''' - if (x_sort_select.value.startsWith("metric|")){ - const key = rank_select.value+"|"+x_method_select.value+"|"+x_sort_select.value.replace("metric|",""); + if (x_sort_select.value.startsWith("cluster|")){ + const key = rank_select.value+"|"+x_sort_select.value.replace("cluster|",""); cds_p_dendro_x.data = {"x": dict_d_dedro_x[key+"|x"], "y": dict_d_dedro_x[key+"|y"], "c": dict_d_dedro_x[key+"|c"]}; - // Enable method select - x_method_select.disabled=false; }else{ cds_p_dendro_x.data = {"x": [], "y": [], "c": []}; - // Disable method select - x_method_select.disabled=true; } ''') x_select_callback = CustomJS( args=dict(heatmap=ele["heatmap"]["fig"], rank_select=ele["heatmap"]["wid"]["rank_select"], - x_method_select=ele["heatmap"]["wid"]["x_method_select"], x_sort_select=ele["heatmap"]["wid"]["x_sort_select"], dict_d_hcluster_x=dict_d_hcluster_x, cds_p_annotations=cds_p_annotations, @@ -425,13 +449,14 @@ def link_heatmap_widgets(ele, code=''' const rank = rank_select.value; var sorted_factors = []; + console.log(x_sort_select.value); if (x_sort_select.value=="none"){ // None sorted_factors = dict_d_hcluster_x["default|" + rank]; - }else if (x_sort_select.value.startsWith("metric|")){ + }else if (x_sort_select.value.startsWith("cluster|")){ // Clustering // Get 
sorted elements based on rank|method|metric - const key = rank+"|"+x_method_select.value+"|"+x_sort_select.value.replace("metric|",""); + const key = rank+"|"+x_sort_select.value.replace("cluster|",""); sorted_factors = dict_d_hcluster_x[key]; }else{ // Sorting @@ -477,21 +502,16 @@ def link_heatmap_widgets(ele, y_dendro_callback = CustomJS( args=dict(rank_select=ele["heatmap"]["wid"]["rank_select"], y_sort_select=ele["heatmap"]["wid"]["y_sort_select"], - y_method_select=ele["heatmap"]["wid"]["y_method_select"], cds_p_dendro_y=cds_p_dendro_y, dict_d_dedro_y=dict_d_dedro_y), code=''' - if (y_sort_select.value.startsWith("metric|")){ - const key = rank_select.value+"|"+y_method_select.value+"|"+y_sort_select.value.replace("metric|",""); + if (y_sort_select.value.startsWith("cluster|")){ + const key = rank_select.value+"|"+y_sort_select.value.replace("cluster|",""); cds_p_dendro_y.data = {"x": dict_d_dedro_y[key+"|x"], "y": dict_d_dedro_y[key+"|y"], "c": dict_d_dedro_y[key+"|c"]}; - // Enable method select - y_method_select.disabled=false; }else{ cds_p_dendro_y.data = {"x": [], "y": [], "c": []}; - // Disable method select - y_method_select.disabled=true; } ''') @@ -500,7 +520,6 @@ def link_heatmap_widgets(ele, cds_d_samples=cds_d_samples, cds_d_metadata=cds_d_metadata, rank_select=ele["heatmap"]["wid"]["rank_select"], - y_method_select=ele["heatmap"]["wid"]["y_method_select"], y_sort_select=ele["heatmap"]["wid"]["y_sort_select"], dict_d_hcluster_y=dict_d_hcluster_y), code=''' @@ -508,10 +527,10 @@ def link_heatmap_widgets(ele, if (y_sort_select.value=="none"){ // None sorted_factors = dict_d_hcluster_y["default"]; - }else if (y_sort_select.value.startsWith("metric|")){ + }else if (y_sort_select.value.startsWith("cluster|")){ // Clustering // Get sorted elements based on rank|method|metric - const key = rank_select.value+"|"+y_method_select.value+"|"+y_sort_select.value.replace("metric|",""); + const key = rank_select.value+"|"+y_sort_select.value.replace("cluster|",""); sorted_factors = dict_d_hcluster_y[key]; }else{ // Sorting @@ -551,9 +570,7 @@ def link_heatmap_widgets(ele, ele["heatmap"]["wid"]["toggle_labels"].js_on_click(toggle_labels_callback) ele["heatmap"]["wid"]["rank_select"].js_on_change('value', x_select_callback, x_dendro_callback, y_select_callback, y_dendro_callback) - ele["heatmap"]["wid"]["x_method_select"].js_on_change('value', x_select_callback, x_dendro_callback) ele["heatmap"]["wid"]["x_sort_select"].js_on_change('value', x_select_callback, x_dendro_callback) - ele["heatmap"]["wid"]["y_method_select"].js_on_change('value', y_select_callback, y_dendro_callback) ele["heatmap"]["wid"]["y_sort_select"].js_on_change('value', y_select_callback, y_dendro_callback) diff --git a/grimer/grimer.py b/grimer/grimer.py index 46d053a..9389d0a 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -398,7 +398,8 @@ def main(): min_obs_perc, max_total_count, cds_p_mgnify, - dict_d_refs) + dict_d_refs, + dict_d_taxname) link_heatmap_widgets(ele, cds_d_samples, diff --git a/grimer/layout.py b/grimer/layout.py index 75aefcb..a229f94 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -90,16 +90,9 @@ def make_layout(ele, sizes, version, logo_path, title): ele["heatmap"]["wid"]["toggle_labels"], sizing_mode="stretch_height", width=300), - column(row(ele["heatmap"]["wid"]["x_sort_select"], - ele["heatmap"]["wid"]["y_sort_select"], - sizing_mode="stretch_width"), - row(ele["heatmap"]["wid"]["x_method_select"], - ele["heatmap"]["wid"]["y_method_select"], - sizing_mode="stretch_width"), - 
#row(ele["heatmap"]["wid"]["x_group_select"], - # Spacer(), - # sizing_mode="stretch_width"), - sizing_mode="stretch_width"), + row(ele["heatmap"]["wid"]["x_sort_select"], + ele["heatmap"]["wid"]["y_sort_select"], + sizing_mode="stretch_width"), column(ele["metadata"]["wid"]["metadata_multiselect"], sizing_mode="stretch_height", width=300)) diff --git a/grimer/plots.py b/grimer/plots.py index c248525..8ceb488 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -712,8 +712,13 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name rank_select = Select(title="Taxonomic rank:", value=ranks[0], options=ranks) + cluster_options = [] + for lmetric in linkage_metrics: + for lmethod in linkage_methods: + cluster_options.append(("cluster|" + lmethod + "|" + lmetric, lmethod + "/" + lmetric)) + x_sort_options = {} - x_sort_options["Clustering Metric"] = [("metric|" + lm, lm) for lm in linkage_metrics] + x_sort_options["Clustering Method/Metric"] = cluster_options x_sort_options["Default order"] = [("none", "none"), ("counts", "counts"), ("observations", "observations")] x_sort_options["Sort by References"] = [("annot|" + r, r) for r in reference_names] if controls_names: @@ -722,9 +727,8 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name x_sort_options["Sort by DECONTAM"] = [("annot|decontam", "decontam")] y_sort_options = {} - y_sort_options["Clustering Metric"] = [("metric|" + lm, lm) for lm in linkage_metrics] + y_sort_options["Clustering Method/Metric"] = cluster_options y_sort_options["Default order"] = [("none", "none"), ("counts", "counts"), ("samples", "samples")] - if metadata: numeric_md_data = metadata.get_data(metadata_type="numeric").columns.to_list() if numeric_md_data: @@ -734,10 +738,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name y_sort_options["Sort by Categorical Metadata"] = [("metadata_cat|" + md, md) for md in categorical_md_data] x_sort_select = Select(title="Observation cluster/sort:", value="none", options=x_sort_options) - x_method_select = Select(title="Observation clustering method:", value=linkage_methods[0], options=linkage_methods, disabled=True) - y_sort_select = Select(title="Sample cluster/sort:", value="none", options=y_sort_options) - y_method_select = Select(title="Sample clustering method:", value=linkage_methods[0], options=linkage_methods, disabled=True) toggle_labels = CheckboxGroup(labels=["Show/Hide observations labels", "Show/Hide samples labels"], active=[]) @@ -754,9 +755,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name """ return {"rank_select": rank_select, - "x_method_select": x_method_select, "x_sort_select": x_sort_select, - "y_method_select": y_method_select, "y_sort_select": y_sort_select, "toggle_labels": toggle_labels, "help_button": help_button(title="Heatmap/Clustering", text=help_text)} From 009fc450fec8a807e7d47aee39591cb4c789e986 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 4 Feb 2022 15:49:55 +0100 Subject: [PATCH 26/50] config class, mgnify download and grimer-mgnify --- grimer-mgnify.py | 39 ++++++ grimer/config.py | 63 ++++++++++ grimer/grimer.py | 73 ++--------- scripts/mgnify_download.py | 241 ++++++++++++++++++++----------------- 4 files changed, 244 insertions(+), 172 deletions(-) create mode 100755 grimer-mgnify.py create mode 100644 grimer/config.py diff --git a/grimer-mgnify.py b/grimer-mgnify.py new file mode 100755 index 0000000..9704a71 --- /dev/null +++ b/grimer-mgnify.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python3 + +import scripts.mgnify_download +import grimer.grimer +import argparse +import os +import glob + +parser = argparse.ArgumentParser(description='grimer-mgnify') +parser.add_argument('-i', '--mgnify-study-accession', required=True, type=str, help="MGnify study accession (e.g. MGYS00002462)") +parser.add_argument('-g', '--grimer-params', type=str, help="Extra params for grimer") +parser.add_argument('-o', '--output-prefix', type=str, help="Output prefix for files and report") +args = parser.parse_args() + +if args.output_prefix: + prefix = args.output_prefix +else: + prefix = args.mgnify_study_accession + +# download files +print("Downloading files for study accession " + args.mgnify_study_accession) +scripts.mgnify_download.main(['-i', args.mgnify_study_accession, '-o', prefix, '-v']) + +files = filter(os.path.isfile, glob.glob(prefix + '*taxonomy_abundances*')) +# Sort files by size ASC +files = sorted(files, key=lambda x: os.stat(x).st_size) +md = glob.glob(prefix + '*_metadata.tsv*') + +grimer.grimer.main(["-i", files[-1], + "-m", md[-1], + "-c", 'config/default.yaml', + "-f", ";", + "--obs-replace", "^.+__", "", "_", " ", + "-r", "superkingdom", "kingdom", "phylum", "class", "order", "family", "genus", "species", + "-t", "ncbi", + "-o", prefix + ".html", + "--title", "MGnify study accession " + args.mgnify_study_accession, + *args.grimer_params.split(" ") + ]) diff --git a/grimer/config.py b/grimer/config.py new file mode 100644 index 0000000..8a99ab1 --- /dev/null +++ b/grimer/config.py @@ -0,0 +1,63 @@ + #!/usr/bin/env python3 +import argparse +import sys +from scipy.spatial.distance import _METRICS_NAMES +from scipy.cluster.hierarchy import _LINKAGE_METHODS + + +class Config: + + version = "1.0.0-alpha1" + default_rank_name = "default" + + def __new__(self, argv=None): + + parser = argparse.ArgumentParser(description='grimer') + + parser.add_argument('-i', '--input-file', required=True, type=str, help="Main input table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --tranpose if your file is reversed). First column and first row are used as headers.") + parser.add_argument('-c', '--config', required=True, type=str, help="Configuration file") + parser.add_argument('-m', '--metadata', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. QIIME 2 metadata format is also accepted, with categorical and numerical fields.") + parser.add_argument('-t', '--tax', type=str, default=None, help="Define taxonomy to use. By default, do not use any taxonomy.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"]) + parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=None, help="Taxonomy files. 
If not provided, will automatically be downloaded.") + parser.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000") + parser.add_argument('-r', '--ranks', nargs="*", default=[Config.default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + Config.default_rank_name + "' to use entries from the table directly. Default: " + Config.default_rank_name) + parser.add_argument('-l', '--title', type=str, default="", help="Title to display on the header of the report.") + parser.add_argument('-o', '--output-html', type=str, default="output.html", help="File to output report. Default: output.html") + parser.add_argument('--full-offline', default=False, action='store_true', help="Embed javascript library in the output file. File will be around 1.5MB bigger but also work without internet connection. That way your report will live forever.") + + table_group = parser.add_argument_group('Table options') + table_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hierarchical multi-level table where the observation headers are separated by the indicated separator character (usually ';' or '|')") + table_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table (if samples are listed on columns and observations on rows)") + table_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unassigned/unclassified counts.") + table_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on table observations labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") + table_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on table sample labels/headers (support regex).
Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") + + filter_group = parser.add_argument_group('Observation filter options') + filter_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") + filter_group.add_argument('--max-frequency', type=float, help="Define maximum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") + filter_group.add_argument('--min-count', type=float, help="Define minimum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") + filter_group.add_argument('--max-count', type=float, help="Define maximum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") + + overview_group = parser.add_argument_group('Overview options') + overview_group.add_argument('-g', '--mgnify', default=False, action='store_true', help="Use MGnify data") + overview_group.add_argument('-d', '--decontam', default=False, action='store_true', help="Run DECONTAM") + + heatmap_group = parser.add_argument_group('Heatmap and clustering options') + heatmap_group.add_argument('-a', '--transformation', type=str, default="log", help="none (counts), norm (percentage), log (log10), clr (centre log ratio). Default: log") + heatmap_group.add_argument('-e', '--metadata-cols', type=int, default=5, help="How many metadata cols to show on the heatmap. Higher values make the plot slower to navigate.") + heatmap_group.add_argument('--optimal-ordering', default=False, action='store_true', help="Activate optimal_ordering on linkage, takes longer for large number of samples.") + heatmap_group.add_argument('--show-zeros', default=False, action='store_true', help="Do not skip zeros on heatmap. File will be bigger and interaction with heatmap slower.") + heatmap_group.add_argument('--linkage-methods', type=str, nargs="*", default=["complete"], choices=list(_LINKAGE_METHODS)) + heatmap_group.add_argument('--linkage-metrics', type=str, nargs="*", default=["euclidean"], choices=_METRICS_NAMES) + heatmap_group.add_argument('--skip-dendrogram', default=False, action='store_true', help="Disable dendrogram. Will create smaller files.") + + correlation_group = parser.add_argument_group('Correlation options') + correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=20, help="Top abundant observations to build the correlation matrix, based on the avg. percentage counts/sample.
0 for all") + + bars_group = parser.add_argument_group('Bars options') + bars_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Top abundant observations to show in the bars.") + + parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + Config.version) + parser.add_argument('-D', '--debug', default=False, action='store_true', help=argparse.SUPPRESS) + + return parser.parse_args(argv) diff --git a/grimer/grimer.py b/grimer/grimer.py index 9389d0a..6836070 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -11,6 +11,7 @@ from grimer.mgnify import MGnify from grimer.callbacks import * from grimer.cds import * +from grimer.config import Config from grimer.layout import * from grimer.plots import * from grimer.utils import * @@ -22,65 +23,11 @@ from bokeh.io import save from bokeh.plotting import output_file -# Scipy -from scipy.spatial.distance import _METRICS_NAMES -from scipy.cluster.hierarchy import _LINKAGE_METHODS - - -def main(): - - default_rank_name = "default" - - version = "1.0.0-alpha1" - parser = argparse.ArgumentParser(description='grimer') - parser.add_argument('-i', '--input-file', required=True, type=str, help="Main input table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --tranpose if your file is reversed). First column and first row are used as headers.") - parser.add_argument('-c', '--config', required=True, type=str, help="Configuration file") - parser.add_argument('-m', '--metadata', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. QIIME 2 metadata format is also accepted, with categorical and numerical fields.") - parser.add_argument('-t', '--tax', type=str, default=None, help="Define taxonomy to use. By default, do not use any taxonomy.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"]) - parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=None, help="Taxonomy files. If not provided, will automatically be downloaded.") - parser.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000") - parser.add_argument('-r', '--ranks', nargs="*", default=[default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + default_rank_name + "' to use entries from the table directly. Default: " + default_rank_name) - parser.add_argument('-l', '--title', type=str, default="", help="Title to display on the header of the report.") - parser.add_argument('-o', '--output-html', type=str, default="output.html", help="File to output report. Default: output.html") - parser.add_argument('--full-offline', default=False, action='store_true', help="Embed javascript library in the output file. File will be around 1.5MB bigger but also work without internet connection. 
That way your report will live forever.") - - table_group = parser.add_argument_group('Table options') - table_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hiearchical multi-level table where the observations headers are separated by the indicated separator characther (usually ';' or '|')") - table_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table (if samples are listed on columns and observations on rows)") - table_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unsassinged/unclassified counts.") - table_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on table observations labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") - table_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on table sample labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") - - filter_group = parser.add_argument_group('Observation filter options') - filter_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") - filter_group.add_argument('--max-frequency', type=float, help="Define maximum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") - filter_group.add_argument('--min-count', type=float, help="Define minimum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") - filter_group.add_argument('--max-count', type=float, help="Define maximum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") - - overview_group = parser.add_argument_group('Overview options') - overview_group.add_argument('-g', '--mgnify', default=False, action='store_true', help="Use MGNify data") - overview_group.add_argument('-d', '--decontam', default=False, action='store_true', help="Run DECONTAM") - - heatmap_group = parser.add_argument_group('Heatmap and clustering options') - heatmap_group.add_argument('-a', '--transformation', type=str, default="log", help="none (counts), norm (percentage), log (log10), clr (centre log ratio). Default: log") - heatmap_group.add_argument('-e', '--metadata-cols', type=int, default=5, help="How many metadata cols to show on the heatmap. Higher values makes plot slower to navigate.") - heatmap_group.add_argument('--optimal-ordering', default=False, action='store_true', help="Activate optimal_ordering on linkage, takes longer for large number of samples.") - heatmap_group.add_argument('--show-zeros', default=False, action='store_true', help="Do not skip zeros on heatmap. File will be bigger and iteraction with heatmap slower.") - heatmap_group.add_argument('--linkage-methods', type=str, nargs="*", default=["complete"], choices=list(_LINKAGE_METHODS)) - heatmap_group.add_argument('--linkage-metrics', type=str, nargs="*", default=["euclidean"], choices=_METRICS_NAMES) - heatmap_group.add_argument('--skip-dendrogram', default=False, action='store_true', help="Disable dendogram. 
Will create smaller files.") - - correlation_group = parser.add_argument_group('Correlation options') - correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=20, help="Top abundant observations to build the correlationn matrix, based on the avg. percentage counts/sample. 0 for all") - - bars_group = parser.add_argument_group('Bars options') - bars_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Top abundant observations to show in the bars.") - - parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + version) - parser.add_argument('-D', '--debug', default=False, action='store_true', help=argparse.SUPPRESS) - args = parser.parse_args() - - print_logo_cli(version) + +def main(argv=sys.argv[1:]): + + args = Config(argv) + print_logo_cli(Config.version) global _debug _debug = args.debug @@ -114,7 +61,7 @@ def main(): # Table of counts print_log("- Parsing table") if not args.ranks: - args.ranks = [default_rank_name] + args.ranks = [Config.default_rank_name] if args.input_file.endswith(".biom"): args.level_separator = ";" @@ -124,7 +71,7 @@ def main(): if args.level_separator: ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) else: - ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, default_rank_name) + ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, Config.default_rank_name) if not ranked_tables: print_log("Could not parse input table") @@ -208,7 +155,7 @@ def main(): # Mgnify if args.mgnify and "mgnify" in cfg["external"]: print_log("- Parsing MGNify") - mgnify = MGnify(cfg["external"]["mgnify"], ranks=table.ranks() if args.ranks != [default_rank_name] else []) + mgnify = MGnify(cfg["external"]["mgnify"], ranks=table.ranks() if args.ranks != [Config.default_rank_name] else []) if tax: mgnify.update_taxids(update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax)) print_log("") @@ -436,7 +383,7 @@ def main(): script_dir, _ = os.path.split(__file__) logo_path = os.path.join(script_dir, "img", "logo.png") - final_layout = make_layout(ele, sizes, version, logo_path, args.title) + final_layout = make_layout(ele, sizes, Config.version, logo_path, args.title) template = include_scripts({os.path.join(script_dir, "js", "func.js"): "script", os.path.join(script_dir, "js", "popup.js"): "script", diff --git a/scripts/mgnify_download.py b/scripts/mgnify_download.py index 5e21d85..e84269b 100755 --- a/scripts/mgnify_download.py +++ b/scripts/mgnify_download.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import argparse import pandas as pd import sys import os @@ -15,115 +16,137 @@ Usage: ./mgnify_download.py study_accession [output_folder] Example single: ./mgnify_download.py MGYS00000554 -Example dump: seq -f "MGYS%08g" 1 5724 | xargs -P 8 -I {} ./mgnify_download.py {} mgnify_dump_20210408/ > mgnify_dump_20210408.log 2>&1 & +Example dump: seq -f "MGYS%08g" 1 5724 | xargs -P 8 -I {} ./mgnify_download.py -i {} -v -g -o mgnify_dump_20210408/ > mgnify_dump_20210408.log 2>&1 & """ -API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/' - -study_accession = sys.argv[1] -output_folder = sys.argv[2]+"/" if len(sys.argv) == 3 else "./" -prefix = output_folder+study_accession -gz = True - -md_file = prefix + "_metadata.tsv" -out_file = prefix + ".pkl" - -if gz: - out_file = out_file + ".gz" - md_file = md_file + ".gz" - -# Check if files exists and skip -tax_files = glob(prefix + "*_taxonomy_abundances_*") -if tax_files 
and os.path.isfile(out_file) and os.path.isfile(md_file): - print(study_accession, "Files found, skipping") - sys.exit(1) - -with Session(API_BASE) as s: - # Get main study resource - try: - study = s.get('studies', study_accession).resource - print(study.accession, "SAMPLES:"+str(study.samples_count), sep="\t",end="\t") - except: - print(study_accession, "Error: Study not found") - sys.exit(1) - - # Save study info as a dict in a pkl file - f = gzip.open(out_file, 'wb') if gz else open(out_file, "wb") - pickle.dump(study.json, file=f) - f.close() - - # Get all taxonomic tables for the highest version of the pipeline - highest_version = 0 - table_version = {} - for download in study.downloads: - label = download.description.label - #["Taxonomic assignments", - #"Taxonomic assignments SSU", - #"Taxonomic assignments LSU" - #"Taxonomic assignments UNITE", - #"Taxonomic assignments ITSoneDB"] - if "Taxonomic assignments" in label: - version = float(download.pipeline.id) - if version not in table_version: - table_version[version] = [] - table_version[version].append(download.url) - if version > highest_version: - highest_version = version - - if not table_version: - print("Error: No taxonomic assignments for this study to download") - sys.exit(1) + +def main(argv=sys.argv[1:]): + + API_BASE = 'https://www.ebi.ac.uk/metagenomics/api/latest/' + + parser = argparse.ArgumentParser(description='grimer-download-mgnify') + parser.add_argument('-i', '--study-accession', required=True, type=str, help="MGnify study accession (e.g. MGYS00002462)") + parser.add_argument('-g', '--gzip', default=False, action='store_true', help="Gzip downloaded files") + parser.add_argument('-v', '--verbose', default=False, action='store_true', help="Verbose output") + parser.add_argument('-o', '--output-prefix', type=str, help="Output prefix for downloaded files. 
Default: --study-accession") + args = parser.parse_args(argv) + + study_accession = args.study_accession + if args.output_prefix: + prefix = args.output_prefix + else: + prefix = study_accession + gz = args.gzip + + md_file = prefix + "_metadata.tsv" + out_file = prefix + ".pkl" + if gz: + out_file = out_file + ".gz" + md_file = md_file + ".gz" + + # Check if files exists and skip + tax_files = glob(prefix + "*_taxonomy_abundances_*") + if tax_files and os.path.isfile(out_file) and os.path.isfile(md_file): + print(study_accession, "Warning: files already exist ") + return + + with Session(API_BASE) as s: + # Get main study resource + try: + study = s.get('studies', study_accession).resource + if args.verbose: + print(study.accession, "SAMPLES:" + str(study.samples_count), sep="\t", end="\t") + except: + print(study_accession, "Error: Study accession not found") + sys.exit(1) + + # Save study info as a dict in a pkl file + f = gzip.open(out_file, 'wb') if gz else open(out_file, "wb") + pickle.dump(study.json, file=f) + f.close() + + # Get all taxonomic tables for the highest version of the pipeline + highest_version = 0 + table_version = {} + for download in study.downloads: + label = download.description.label + #["Taxonomic assignments", + #"Taxonomic assignments SSU", + #"Taxonomic assignments LSU" + #"Taxonomic assignments UNITE", + #"Taxonomic assignments ITSoneDB"] + if "Taxonomic assignments" in label: + version = float(download.pipeline.id) + if version not in table_version: + table_version[version] = [] + table_version[version].append(download.url) + if version > highest_version: + highest_version = version + + if not table_version: + print("Error: No taxonomic assignments for this study to download") + sys.exit(1) + else: + table_urls = table_version[highest_version] + + # Get all available samples in one go and collect metadata + params = { + 'study_accession': study_accession, + 'page_size': study.samples_count, + } + fltr = Filter(urlencode(params)) + + metadata = {} + for sample in s.iterate('samples', fltr): + # TODO: how to access runs faster, sample.runs is too slow + #nruns += len(sample.runs) + metadata[sample.accession] = {} + for md in sample.sample_metadata: + metadata[sample.accession][md["key"]] = md["value"] + # Add sample description, name and name as metadata + metadata[sample.accession]['sample-desc'] = sample.sample_desc + metadata[sample.accession]['sample-name'] = sample.sample_name + + # Get link sample accession, run accession + # TODO treat multiple runs per sample + run_sample_accesion = {} + try: + for run in s.iterate('runs', fltr): + run_sample_accesion[run.sample.id] = run.id + except: + print("Error: Could not retrieve run accession", sep="\t", end="\t") + + # Write metadata + md_df = pd.DataFrame.from_dict(metadata).T + if run_sample_accesion: + mapped_accessions = md_df.index.isin(run_sample_accesion.keys()) + if args.verbose: + print("MAPPED:" + str(sum(mapped_accessions)), sep="\t", end="\t") + md_df.index = md_df.index.map(lambda x: run_sample_accesion[x] if x in run_sample_accesion else x) else: - table_urls = table_version[highest_version] - - # Get all available samples in one go and collect metadata - params = { - 'study_accession': study_accession, - 'page_size': study.samples_count, - } - fltr = Filter(urlencode(params)) - - metadata = {} - for sample in s.iterate('samples', fltr): - # TODO: how to access runs faster, sample.runs is too slow - #nruns += len(sample.runs) - metadata[sample.accession] = {} - for md in sample.sample_metadata: - 
metadata[sample.accession][md["key"]] = md["value"] - # Add sample description, name and name as metadata - metadata[sample.accession]['sample-desc'] = sample.sample_desc - metadata[sample.accession]['sample-name'] = sample.sample_name - - # Get link sample accession, run accession - # TODO treat multiple runs per sample - run_sample_accesion = {} - try: - for run in s.iterate('runs', fltr): - run_sample_accesion[run.sample.id] = run.id - except: - print("Error: Could not retrieve run accession", sep="\t", end="\t") - -# Write metadata -md_df = pd.DataFrame.from_dict(metadata).T -if run_sample_accesion: - mapped_accessions = md_df.index.isin(run_sample_accesion.keys()) - print("MAPPED:" + str(sum(mapped_accessions)), sep="\t", end="\t") - md_df.index = md_df.index.map(lambda x: run_sample_accesion[x] if x in run_sample_accesion else x) -else: - print("Error: No mapping between accessions of samples and metadata", sep="\t", end="\t") - -print("METADATA:" + str(md_df.shape[1]), sep="\t", end="\t") -md_df.to_csv(md_file, compression="gzip" if gz else None, sep="\t") - -# Read and write tables -for table_url in table_urls: - try: - t = pd.read_table(table_url) - print("OK:" + table_url, end=";") - # Print original - filename = prefix + "_" + os.path.basename(table_url) - t.to_csv(filename if not gz else filename+".gz", compression="gzip" if gz else None, sep="\t", index=False) - except: - print("INVALID:" + table_url, end=";") - -print() + if args.verbose: + print("Warning: No mapping between accessions of samples and metadata", sep="\t", end="\t") + + if args.verbose: + print("METADATA:" + str(md_df.shape[1]), sep="\t", end="\t") + md_df.to_csv(md_file, compression="gzip" if gz else None, sep="\t") + + # Read and write tables + for table_url in table_urls: + try: + t = pd.read_table(table_url) + if args.verbose: + print("OK:" + table_url, end=";") + # Print original + filename = prefix + "_" + os.path.basename(table_url) + t.to_csv(filename if not gz else filename + ".gz", compression="gzip" if gz else None, sep="\t", index=False) + except: + if args.verbose: + print("INVALID:" + table_url, end=";") + + if args.verbose: + print() + + +if __name__ == "__main__": + main() From a3872be5c2e4aea07ad3b0dc1577526dc9d77e0a Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Mon, 7 Feb 2022 11:33:57 +0100 Subject: [PATCH 27/50] fix biom parsing --- grimer/callbacks.py | 4 +--- grimer/utils.py | 10 ++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 5f8527b..6367d90 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -230,6 +230,7 @@ def link_obstable_samplebars(ele, code=''' // selected row var row = cb_obj.indices[0]; + const name = cds_p_obstable.data['col|name'][row]; const rank = cds_p_obstable.data['col|rank'][row]; const taxid = cds_p_obstable.data['index'][row]; @@ -248,9 +249,7 @@ def link_obstable_samplebars(ele, text+=rank; text+="\\n"; - var lineage = ""; - for(let r = 0; r < active_ranks.length; r++){ var obs_lin = cds_p_obstable.data["tax|" + active_ranks[r]][row]; if(taxid!=name){ @@ -449,7 +448,6 @@ def link_heatmap_widgets(ele, code=''' const rank = rank_select.value; var sorted_factors = []; - console.log(x_sort_select.value); if (x_sort_select.value=="none"){ // None sorted_factors = dict_d_hcluster_x["default|" + rank]; diff --git a/grimer/utils.py b/grimer/utils.py index dd6b012..abb4d2c 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -14,7 +14,7 @@ from bokeh.palettes import Blues, Category10, Category20, Colorblind, Dark2, linear_palette, Magma256, Reds, Turbo256 #biom -from biom import parse_table as parse_table_biom +import biom # scikit-bio from skbio.stats.composition import clr @@ -26,9 +26,7 @@ def parse_input_table(input_file, unassigned_header, transpose, sample_replace): if input_file.endswith(".biom"): - with open(input_file, encoding="utf8", errors='ignore') as f: - table_df = parse_table_biom(f).to_dataframe(dense=True) - # biom convert -i feature-table.biom -o feature-table.biom.tsv --to-tsv + table_df = biom.load_table(input_file).to_dataframe(dense=True) else: # Default input_file: index=observations, columns=samples # table_df should have samples on indices and observations on columns @@ -39,7 +37,7 @@ def parse_input_table(input_file, unassigned_header, transpose, sample_replace): table_df = table_df.transpose() # Remove header on rows - table_df.index.names = [None] + table_df.index.name = None # Replace text on sample labels if sample_replace: @@ -208,9 +206,9 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): # ranks_df and table_df.T have the same shape ranked_table_df = pd.concat([ranks_df[r], table_df.T.reset_index(drop=True)], axis=1) ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).sum().T + ranked_tables[r].columns.name = None lineage = ranks_df - return ranked_tables, lineage From 67faab74f6bbf56769a84fec5e38c9d5bc808545 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 8 Feb 2022 20:03:30 +0100 Subject: [PATCH 28/50] new contaminants --- files/contaminants.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/files/contaminants.yml b/files/contaminants.yml index 6a57d09..3738a6b 100644 --- a/files/contaminants.yml +++ b/files/contaminants.yml @@ -44,6 +44,12 @@ "2017 Salter, S.J. 
et al.": url: "http://doi.org/10.1371/journal.pntd.0005975" ids: [50709, 299566, 1375, 2040, 507, 31988, 165779, 161492, 150247, 92793, 374, 55080, 1696, 41275, 369926, 32008, 194, 2717, 75, 10, 59732, 1716, 37914, 231454, 423604, 212791, 117563, 963, 1004300, 682522, 1357, 149698, 906, 68287, 407, 33882, 1839, 528, 376469, 84567, 335058, 28100, 838, 286, 83618, 48736, 379, 1835, 45669, 22, 28453, 13687, 40323, 1054211, 13275, 33057, 157, 213484, 29465, 1827, 265, 1386] + "2019 Stinson, L.F. et al.": + url: "http://doi.org/10.1111/lam.13091" + ids: [561, 335058, 407, 13687, 407, 374, 165696, 222, 1716, 547, 48736, 1004302, 1827, 743, 1269, 204456, 106589, 1678] + "2002 Kulakov, L.A. et al.": + url: "http://doi.org/10.1128/AEM.68.4.1548-1555.2002" + ids: [329, 376, 239, 36773, 69392, 1785, 1409, 304, 28214, 294] "Common Viral contaminants": "2019 Asplund, M. et al.": url: "http://doi.org/10.1016/j.cmi.2019.04.028" @@ -58,3 +64,6 @@ "PRJNA168": url: "https://www.ncbi.nlm.nih.gov/genome/guide/human/" ids: [9606] + "2016 Czurda, S. et al.": + url: "https://doi.org/10.1128/JCM.02112-15" + ids: [1895944, 76775, 5308] From 7e099030547dc6a38a05605e9df0cfa45c5ffd51 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 9 Feb 2022 15:25:12 +0100 Subject: [PATCH 29/50] more contaminants --- files/contaminants.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/files/contaminants.yml b/files/contaminants.yml index 3738a6b..4b77536 100644 --- a/files/contaminants.yml +++ b/files/contaminants.yml @@ -4,7 +4,7 @@ ids: [407, 335058, 504481, 1747, 40324, 1033, 38304, 28037, 470, 29448, 1828, 92793, 75, 375, 180282, 851, 301302, 853, 816, 33870, 85698, 87883, 147207, 68909, 1043493, 293256, 1134405, 410, 321895, 432308, 1416628, 1314, 1343, 69359] "2018 Kirstahler, P. 
et al.": url: "http://doi.org/10.1038/s41598-018-22416-4" - ids: [1747, 40324, 470, 29448, 294, 1828, 39491, 40214, 28090, 134533, 108981, 202956, 239935, 28117, 28116, 818, 820, 161879, 33011, 80866, 853, 823, 821, 296, 303, 34073, 1050843, 1055192, 106648, 1075768, 1076, 1112209, 1131812, 1150298, 1159870, 1160721, 1198452, 1217692, 1276755, 1320556, 134534, 1353941, 136273, 1435036, 147645, 1492737, 1492738, 1497615, 1504823, 1509403, 1519439, 1538644, 1619232, 162426, 1646498, 165179, 1654716, 1665556, 1678129, 169292, 1706231, 1714344, 172088, 1736272, 1736280, 1736296, 1736316, 1736528, 1736532, 1740090, 1833, 1835254, 192843, 202952, 202954, 211589, 216465, 245, 246602, 246787, 247, 266749, 2702, 2736, 285, 28901, 29536, 310297, 310298, 33010, 34062, 346179, 362413, 362418, 370974, 38303, 387661, 40520, 418240, 46506, 47920, 503361, 50340, 52133, 529884, 53412, 55197, 55508, 5665, 64974, 70863, 75659, 756892, 76773, 80878, 80882, 86182, 96345, 986, 989370, 991, 99158] + ids: [1747, 40324, 470, 29448, 294, 1828, 39491, 40214, 28090, 134533, 108981, 202956, 239935, 28117, 28116, 818, 820, 161879, 33011, 80866, 853, 823, 821, 296, 303, 34073, 2559073, 1055192, 106648, 1075768, 1076, 1112209, 1131812, 1150298, 1159870, 1160721, 1198452, 1217692, 1276755, 1320556, 134534, 1353941, 136273, 1435036, 147645, 1492737, 1492738, 1497615, 1504823, 1509403, 1519439, 1538644, 1619232, 162426, 1646498, 165179, 1654716, 1665556, 1678129, 169292, 1706231, 1714344, 172088, 1736272, 1736280, 1736296, 1736316, 1736528, 1736532, 1740090, 1833, 1835254, 192843, 202952, 202954, 211589, 216465, 245, 246602, 246787, 247, 266749, 2702, 2736, 285, 28901, 29536, 310297, 310298, 33010, 34062, 346179, 362413, 362418, 370974, 38303, 387661, 40520, 418240, 46506, 47920, 503361, 50340, 52133, 529884, 53412, 55197, 55508, 5665, 64974, 70863, 75659, 756892, 76773, 80878, 80882, 86182, 96345, 986, 989370, 991, 99158] "2015 Jervis-Bardy, J. et al.": url: "http://doi.org/10.1186/s40168-015-0083-8" ids: [286, 48736, 59732, 335058, 41275, 28100, 34072] @@ -44,16 +44,19 @@ "2017 Salter, S.J. et al.": url: "http://doi.org/10.1371/journal.pntd.0005975" ids: [50709, 299566, 1375, 2040, 507, 31988, 165779, 161492, 150247, 92793, 374, 55080, 1696, 41275, 369926, 32008, 194, 2717, 75, 10, 59732, 1716, 37914, 231454, 423604, 212791, 117563, 963, 1004300, 682522, 1357, 149698, 906, 68287, 407, 33882, 1839, 528, 376469, 84567, 335058, 28100, 838, 286, 83618, 48736, 379, 1835, 45669, 22, 28453, 13687, 40323, 1054211, 13275, 33057, 157, 213484, 29465, 1827, 265, 1386] + "2018 Stinson, L.F. et al.": + url: "http://doi.org/10.3389/fmicb.2018.00270" + ids: [1696, 1716, 43668, 37914, 1269, 32207, 1743, 836, 838, 1016, 308865, 1386, 2755, 1279, 66831, 1350, 1578, 1301, 29465, 374, 407, 434, 165696, 13687, 283, 80865, 93681, 48736, 570, 713, 469, 212791, 286, 40323] "2019 Stinson, L.F. et al.": url: "http://doi.org/10.1111/lam.13091" - ids: [561, 335058, 407, 13687, 407, 374, 165696, 222, 1716, 547, 48736, 1004302, 1827, 743, 1269, 204456, 106589, 1678] + ids: [561, 335058, 407, 13687, 407, 374, 165696, 222, 1716, 547, 48736, 1004302, 1827, 1743, 1269, 204456, 106589, 1678] "2002 Kulakov, L.A. et al.": url: "http://doi.org/10.1128/AEM.68.4.1548-1555.2002" ids: [329, 376, 239, 36773, 69392, 1785, 1409, 304, 28214, 294] "Common Viral contaminants": "2019 Asplund, M. 
et al.": url: "http://doi.org/10.1016/j.cmi.2019.04.028" - ids: [12071, 742919, 11103, 31647, 12461, 10298, 10376, 10359, 11676, 129951, 10583, 31552, 10798, 11908, 585044, 518981, 1225745, 11620, 1891767, 493803, 11033, 159150, 35306, 68887, 11870, 11958, 11861, 11946, 11864, 363745, 363020, 242521, 11866, 11960, 31668, 31669, 31670, 11867, 11955, 11874, 11876, 11878, 11885, 36381, 11886, 11888, 269447, 269448, 11950, 11948, 1332312, 354090, 11884, 1352534, 1395610, 1395611, 1395612, 1395613, 1395614, 1395615, 1395616, 1395617, 1395618, 1395619, 1395620, 1341019, 11801, 11809, 1511763, 1394983, 697906, 1072204, 1148801, 1574422, 12104, 763552, 10264, 85708, 759804, 28344, 85506, 33747, 10345, 285986, 220638, 1154691, 185638, 1169627, 1045778, 185636, 72201, 345198, 176652, 1301280, 68347, 1618248, 1618254, 10288, 198112, 1454023, 1454024, 1454025, 1278278, 1278246, 1278252, 1278247, 1278248, 1278249, 1278250, 1278251, 399781, 1278255, 346932, 1278261, 1278263, 1278265, 1474867, 1379694, 1521385, 1521387, 1521389, 938081, 938082, 880162, 251749, 455370, 169864, 1379788, 1608440, 642253, 642255, 1224510, 1592207, 1592212, 1592083, 1592085, 1592086, 1592088, 1592093, 1592095, 1592096, 1592081, 1843761, 1519405, 1557033, 1608451, 664785, 1435438, 1170653, 40979, 12235, 12138, 11987, 51680, 12056, 146500, 554168, 212035, 1269028, 693272, 1420594, 1094892, 1128140, 1235314, 1128143, 1128151, 1128131, 1450746, 1461100, 181522, 1424633, 1010698, 1299317, 1450749, 1416631, 1128422, 1034806, 1592112, 1592113, 1592127, 938080, 1074214, 1519385, 1519387, 1519389, 1519390, 1519395, 1519396, 1519397, 186617, 1262072, 1407671, 743583, 340016, 745107, 745102, 745100, 1416009, 1187128, 889876, 760732, 1243183, 1229760, 1481186, 1505225, 1560342, 233894, 115987, 260149, 227470, 926067, 1127514, 1296654, 294382, 1486657, 1084719, 10756, 1486662, 1285382, 1497851, 1127515, 145579, 263375, 764562, 1133292, 1133022, 242527, 260373, 279280, 644524, 242861, 1132026, 1357714, 1197951, 1327981, 1327976, 1327979, 1327992, 1328030, 1327990, 1327980, 1327972, 1327982, 1327995, 1327983, 1327970, 1327971, 756279, 1327977, 1327993, 1328029, 1327975, 1327974, 1327985, 756280, 756282, 1527524, 1540094, 1042123, 541865, 1567016, 765765, 1176422, 1327037, 1162295, 1141135, 1141136, 335924, 536444, 929832, 682650, 1137745, 536473, 749413, 1477406, 1048515, 1048516, 1048517, 1048520, 1048521, 1537091, 1264700, 1609634, 1455074, 414970, 10863, 10864, 1222338, 1147148, 1237364, 1414766, 1977402, 948870, 1524881, 10665, 10760, 1147094, 1429767, 925983, 925984, 1527519, 1527506, 1229753, 1540097, 1540098, 1054461, 1391223, 294631, 1325731, 908819, 1458858, 1458842, 90963, 1536592, 1527515, 551895, 1129191, 139872, 201847, 287412, 1262517, 754044, 1385658, 1176423, 889949, 446529, 1034128, 1056830, 1089119, 1486472, 1034111, 205879, 1340709, 1567475, 1472912, 1204539, 1399915, 1283076, 1283077, 1168479, 1168478, 440250, 400567, 994601, 1465639, 889956, 445700, 444862, 536454, 445688, 444861, 1229794, 1229793, 1229792, 1229791, 1229790, 1229789, 1229786, 1229787, 1229788, 1229784, 1229782, 376758, 1498188, 504501, 504553, 1235647, 1235648, 1235649, 1235650, 1235653, 1235654, 1235655, 1235656, 1235657, 877240, 754052, 1316739, 347326, 1235689, 31535, 757342, 582345, 1462581, 386793, 1204517, 347327, 1335230, 743813, 1348912, 1327964, 270673, 188350, 1541891, 169683, 998086, 1500757, 1458843, 1129146, 1279082, 1114179, 1548900, 1231048, 1548901, 1449437, 1548918, 1476390, 462590, 754048, 948071, 1481785, 1417599, 1131316, 691965, 
136084, 754067, 1161935, 1173749, 1173761, 1173759, 1173762, 590739, 1406795, 1141134, 1204529, 1540099, 1168549, 866889, 1458859, 1458860, 1458861, 10761, 754060, 1524882, 1357423, 373126, 1150991, 1195080, 320843, 55510, 1434319, 320850, 369581, 537874, 1208587, 1566990, 10732, 490913, 1526550, 1340810, 756277, 753084, 753085, 756275, 1026955, 1340812, 238854, 555387, 754042, 444860, 981335, 469660, 215796, 1478972, 1385659, 926697, 336724, 278008, 1211417, 271647, 754075, 573173, 573174, 979525, 979534, 1529058, 1283071, 573176, 1589298, 1076759, 1461743, 1150989, 754058, 754051, 929835, 1414739, 754072, 1524880, 194802, 1168281, 1204514, 1188795, 331278] + ids: [12071, 742919, 11103, 31647, 1678143, 10298, 10376, 10359, 11676, 129951, 10583, 31552, 10798, 11908, 585044, 518981, 1225745, 11620, 1891767, 493803, 11033, 159150, 35306, 68887, 11870, 11958, 11861, 11946, 11864, 363745, 363020, 242521, 11866, 11960, 31668, 31669, 31670, 11867, 11955, 11874, 11876, 11878, 11885, 36381, 11886, 11888, 269447, 269448, 11950, 11948, 1332312, 354090, 11884, 1352534, 1395610, 1395611, 1395612, 1395613, 1395614, 1395615, 1395616, 1395617, 1395618, 1395619, 1395620, 1341019, 11801, 11809, 1511763, 1394983, 697906, 1072204, 1148801, 1574422, 12104, 763552, 10264, 85708, 759804, 28344, 85506, 33747, 10345, 285986, 220638, 1154691, 185638, 1169627, 1045778, 185636, 72201, 345198, 176652, 1301280, 68347, 1618248, 1618254, 10288, 198112, 1454023, 1454024, 1454025, 1278278, 1278246, 1278252, 1278247, 1278248, 1278249, 1278250, 1278251, 399781, 1278255, 346932, 1278261, 1278263, 1278265, 1474867, 1379694, 1521385, 1521387, 1521389, 938081, 938082, 880162, 251749, 455370, 169864, 1379788, 1608440, 642253, 642255, 1224510, 1592207, 1592212, 1592083, 1592085, 1592086, 1592088, 1592093, 1592095, 1592096, 1592081, 1843761, 1519405, 1557033, 1608451, 664785, 1435438, 1170653, 40979, 12235, 12138, 11987, 51680, 12056, 146500, 554168, 212035, 1269028, 693272, 1420594, 1094892, 1128140, 1235314, 1128143, 1128151, 1128131, 1450746, 1461100, 181522, 1424633, 1010698, 1299317, 1450749, 1416631, 1128422, 1034806, 1592112, 1592113, 1592127, 938080, 1074214, 1519385, 1519387, 1519389, 1519390, 1519395, 1519396, 1519397, 186617, 1262072, 1407671, 743583, 340016, 745107, 745102, 745100, 1416009, 1187128, 889876, 760732, 1243183, 1229760, 1481186, 1505225, 1560342, 233894, 115987, 260149, 227470, 926067, 1127514, 1296654, 294382, 1486657, 1084719, 10756, 1486662, 1285382, 1497851, 1127515, 145579, 263375, 764562, 1133292, 1133022, 242527, 260373, 279280, 644524, 242861, 1132026, 1357714, 1197951, 1327981, 1327976, 1327979, 1327992, 1328030, 1327990, 1327980, 1327972, 1327982, 1327995, 1327983, 1327970, 1327971, 756279, 1327977, 1327993, 1328029, 1327975, 1327974, 1327985, 756280, 756282, 1527524, 1540094, 1042123, 541865, 1567016, 765765, 1176422, 1327037, 1162295, 1141135, 1141136, 335924, 536444, 929832, 682650, 1137745, 536473, 749413, 1477406, 1048515, 1048516, 1048517, 1048520, 1048521, 1537091, 1264700, 1609634, 1455074, 414970, 10863, 10864, 1222338, 1147148, 1237364, 1414766, 1977402, 948870, 1524881, 10665, 10760, 1147094, 1429767, 925983, 925984, 1527519, 1527506, 1229753, 1540097, 1540098, 1054461, 1391223, 294631, 1325731, 908819, 1458858, 1458842, 90963, 1536592, 1527515, 551895, 1129191, 139872, 201847, 287412, 1262517, 754044, 1385658, 1176423, 889949, 446529, 1034128, 1056830, 1089119, 1486472, 1034111, 205879, 1340709, 1567475, 1472912, 1204539, 1399915, 1283076, 1283077, 1168479, 1168478, 440250, 400567, 
994601, 1465639, 889956, 445700, 444862, 536454, 445688, 444861, 1229794, 1229793, 1229792, 1229791, 1229790, 1229789, 1229786, 1229787, 1229788, 1229784, 1229782, 376758, 1498188, 504501, 504553, 1235647, 1235648, 1235649, 1235650, 1235653, 1235654, 1235655, 1235656, 1235657, 877240, 754052, 1316739, 347326, 1235689, 31535, 757342, 582345, 1462581, 386793, 1204517, 347327, 1335230, 743813, 1348912, 1327964, 270673, 188350, 1541891, 169683, 998086, 1500757, 1458843, 1129146, 1279082, 1114179, 1548900, 1231048, 1548901, 1449437, 1548918, 1476390, 462590, 754048, 948071, 1481785, 1417599, 1131316, 691965, 136084, 754067, 1161935, 1173749, 1173761, 1173759, 1173762, 590739, 1406795, 1141134, 1204529, 1540099, 1168549, 866889, 1458859, 1458860, 1458861, 10761, 754060, 1524882, 1357423, 373126, 1150991, 1195080, 320843, 55510, 1434319, 320850, 369581, 537874, 1208587, 1566990, 10732, 490913, 1526550, 1340810, 756277, 753084, 753085, 756275, 1026955, 1340812, 238854, 555387, 754042, 444860, 981335, 469660, 215796, 1478972, 1385659, 926697, 336724, 278008, 1211417, 271647, 754075, 573173, 573174, 979525, 979534, 1529058, 1283071, 573176, 1589298, 1076759, 1461743, 1150989, 754058, 754051, 929835, 1414739, 754072, 1524880, 194802, 1168281, 1204514, 1188795, 331278] "2015 Mukherjee, S. et al.": url: "http://doi.org/10.1186/1944-3277-10-18" ids: [10847] From d1c7df1b109668045114a7c6928a70f7d0335b9f Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 10 Feb 2022 11:59:04 +0100 Subject: [PATCH 30/50] legend on metadata heatmap --- grimer/callbacks.py | 15 ++++++++++++-- grimer/cds.py | 12 ++++------- grimer/config.py | 2 +- grimer/grimer.py | 1 + grimer/layout.py | 1 + grimer/plots.py | 49 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 62 insertions(+), 18 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 6367d90..77fa250 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -575,6 +575,9 @@ def link_heatmap_widgets(ele, def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols): metadata_multiselect_callback = CustomJS( args=dict(metadata_heatmap=ele["metadata"]["fig"], + metadata_multiselect=ele["metadata"]["wid"]["metadata_multiselect"], + legend_colorbars=ele["metadata"]["wid"]["legend_colorbars"], + toggle_legend=ele["metadata"]["wid"]["toggle_legend"], max_metadata_cols=max_metadata_cols, cds_p_metadata=cds_p_metadata, cds_d_metadata=cds_d_metadata), @@ -583,9 +586,11 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols var x_factors = []; var empty_y_values = new Array(index_len); for (var i = 0; i < index_len; ++i) empty_y_values[i]=["", ""]; + // hide all legends + for (let md_header in legend_colorbars) legend_colorbars[md_header].visible = false; for(var s=0; s < max_metadata_cols; ++s){ - if (s Date: Thu, 10 Feb 2022 15:52:34 +0100 Subject: [PATCH 31/50] sort by tax --- grimer/callbacks.py | 15 ++++++++++++--- grimer/cds.py | 1 - grimer/plots.py | 22 +++++++++------------- 3 files changed, 21 insertions(+), 17 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 77fa250..b913d37 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -460,10 +460,19 @@ def link_heatmap_widgets(ele, // Sorting var factors = []; var sort_col = []; - if (x_sort_select.value=="counts"){ + if (x_sort_select.value.startsWith("tax|")){ + const group_rank = x_sort_select.value.replace("tax|",""); for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { 
if(cds_p_obstable.data["col|rank"][i]==rank){ - factors.push(cds_p_obstable.data["index"][i]) + factors.push(cds_p_obstable.data["index"][i]); + sort_col.push(cds_p_obstable.data["tax|" + group_rank][i]); + } + } + sorted_factors = grimer_sort(factors, sort_col, "string", false); + }else if (x_sort_select.value=="counts"){ + for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { + if(cds_p_obstable.data["col|rank"][i]==rank){ + factors.push(cds_p_obstable.data["index"][i]); sort_col.push(cds_p_obstable.data["col|total_counts"][i]); } } @@ -471,7 +480,7 @@ def link_heatmap_widgets(ele, }else if (x_sort_select.value=="observations"){ for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { if(cds_p_obstable.data["col|rank"][i]==rank){ - factors.push(cds_p_obstable.data["index"][i]) + factors.push(cds_p_obstable.data["index"][i]); sort_col.push(cds_p_obstable.data["col|name"][i]); } } diff --git a/grimer/cds.py b/grimer/cds.py index cc22884..aac4257 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -319,7 +319,6 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): #Drop zeros based on original counts if not show_zeros: stacked_rank_df = stacked_rank_df[stacked_rank_df["ov"] > 0] - df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0) df_heatmap.drop('ov', axis=1, inplace=True) diff --git a/grimer/plots.py b/grimer/plots.py index 2c9ba37..a689d9d 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -341,9 +341,8 @@ def plot_obstable_widgets(sizes, dict_d_taxname, max_count_rank): counts_perc_avg_spinner = Spinner(title="Avg. counts/sample", low=0, high=100, value=0, step=0.1, width=spinner_width, height=50) total_counts_spinner = Spinner(title="Total counts", low=1, high=max_count_rank, step=1, value=1, width=spinner_width, height=50) # Create unique list of names with taxids for filtering. map to str and set to get unique - unique_dict_d_taxname_tuples = set(zip(dict_d_taxname.keys(), map(str, dict_d_taxname.values()))) - name_multichoice = MultiChoice(title="Obs. 
name or id", - options=list(unique_dict_d_taxname_tuples), + name_multichoice = MultiChoice(title="Observation name or id", + options=list(set(zip(dict_d_taxname.keys(), map(str, dict_d_taxname.values())))), sizing_mode="fixed", width=sizes["overview_top_panel_width_left"] - 20, height=60) @@ -372,8 +371,6 @@ def plot_obstable_widgets(sizes, dict_d_taxname, max_count_rank): "help_button": help_button(title="Observation table", text=help_text, align="start")} - - def plot_sampletable(cds_p_sampletable, sizes, ranks): table_cols = [] @@ -675,17 +672,12 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax color_mapper.low = min(cds_p_heatmap.data["tv"]) color_mapper.high = max(cds_p_heatmap.data["tv"]) - # Convert taxid ticks to taxa names on client-side - heatmap.xaxis.formatter = FuncTickFormatter(args=dict(dict_d_taxname=dict_d_taxname), code=''' - return dict_d_taxname[tick]; - ''') - heatmap.rect(x="obs", y="index", width=1, height=1, source=cds_p_heatmap, fill_color={'field': 'tv', 'transform': color_mapper}, line_color=None) - color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, border_line_color=None, location="center", orientation="horizontal") + color_bar = ColorBar(color_mapper=color_mapper, label_standoff=6, height=10, border_line_color=None, location="center", orientation="horizontal") heatmap.add_layout(color_bar, 'above') # Convert taxid ticks to taxa names on client-side @@ -725,6 +717,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name x_sort_options["Sort by Controls"] = [("annot|" + c, c) for c in controls_names] if decontam: x_sort_options["Sort by DECONTAM"] = [("annot|decontam", "decontam")] + x_sort_options["Sort by taxonomic rank"] = [("tax|" + r, r) for r in ranks] y_sort_options = {} y_sort_options["Clustering Method/Metric"] = cluster_options @@ -752,6 +745,8 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name The bottom-most panel shows annotations for each observation/taxa values (x-axis). The metadata and annotation plots are automatically sorted to reflect the clustering/sort of the heatmap. + +**If the panels are not properly aligned after data selection, use the reset tool (top right) to re-align them** """ return {"rank_select": rank_select, @@ -871,7 +866,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada metadata_fig.add_tools(HoverTool(tooltips=tooltips, formatters=formatters)) for col in cols: - metadata_fig.rect(x=dict(value=col), y="index", + metadata_fig.rect(x={"value": col}, y="index", width=1, height=1, source=cds_p_metadata, fill_color={'field': col, 'transform': metadata_colormap}, @@ -881,7 +876,8 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada for i, md_header in enumerate(metadata_fields): # Start showing only first - if i == 0: legend_colorbars[md_header].visible = True + if i == 0: + legend_colorbars[md_header].visible = True metadata_fig.add_layout(legend_colorbars[md_header], 'right') metadata_fig.xaxis.axis_label = "metadata" From f8ea4e48516636634de9eb93dceb5a7a023dbdfe Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 10 Feb 2022 17:28:55 +0100 Subject: [PATCH 32/50] lower default decontam threhsold, treat boolean fields metadata --- config/default.yaml | 2 +- grimer/metadata.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/default.yaml b/config/default.yaml index cd4bd40..42d4bb3 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -9,7 +9,7 @@ references: external: mgnify: "files/mgnify.tsv" decontam: - threshold: 0.2 # [0-1] + threshold: 0.1 # [0-1] method: "frequency" # frequency, prevalence, combined # # frequency (default: use sum of counts) # frequency_file: "path/file1.txt" diff --git a/grimer/metadata.py b/grimer/metadata.py index 4160879..d1135ab 100644 --- a/grimer/metadata.py +++ b/grimer/metadata.py @@ -26,7 +26,7 @@ def __init__(self, metadata_file, samples: list=[]): self.types[self.data.dtypes.map(is_numeric_dtype)] = "numeric" # Convert datatypes to adequate numeric values (int, float) - self.data = self.data.convert_dtypes(infer_objects=False, convert_string=False) + self.data = self.data.convert_dtypes(infer_objects=False, convert_string=False, convert_boolean=False) # Re-convert everython to object to standardize (int64 NA is not seriazable on bokeh) self.data = self.data.astype("object") @@ -39,7 +39,7 @@ def __init__(self, metadata_file, samples: list=[]): # Convert NaN on categorical to "" self.data[self.types[self.types == "categorical"].index] = self.data[self.types[self.types == "categorical"].index].fillna('') - # Convert boolean to String + # Convert boolean from categorical to String mask = self.data[self.types[self.types == "categorical"].index].applymap(type) != bool self.data[self.types[self.types == "categorical"].index] = self.data[self.types[self.types == "categorical"].index].where(mask, self.data[self.types[self.types == "categorical"].index].replace({True: 'True', False: 'False'})) From 95ddb1d734a08b5506b3256d062a2ddd1aef6e84 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 11 Feb 2022 16:06:19 +0100 Subject: [PATCH 33/50] working on 2.4 --- grimer/plots.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/grimer/plots.py b/grimer/plots.py index a689d9d..2c55ce6 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -1,7 +1,7 @@ import markdown # Bokeh -from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, PrintfTickFormatter, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput +from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LegendItem, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, PrintfTickFormatter, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput from bokeh.models.filters import IndexFilter, GroupFilter from bokeh.models.widgets import DataTable, TableColumn from bokeh.palettes import Blues, Dark2, Magma256, Reds @@ -164,10 +164,10 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna legend_bars_items = [] for i in range(top_obs_bars): if i < len(dict_d_topobs[ranks[0]]): - label = str(i+1) + ") " + dict_d_taxname[dict_d_topobs[ranks[0]][i]] + label = str(i + 1) + ") " + dict_d_taxname[dict_d_topobs[ranks[0]][i]] else: label = None - legend_bars_items.append((label, [vbar_ren[i]])) + legend_bars_items.append(LegendItem(label=label, renderers=[vbar_ren[i]])) legend_obsbars = Legend(items=legend_bars_items) # legend_bars.label_text_font_size="9px" @@ -629,7 +629,7 @@ def plot_mgnify(sizes, cds_p_mgnify): def plot_mgnify_widgets(): - biome_spinner = Spinner(title="Biome level", low=1, high=5, value=1, step=1, width=100, height=50, orientation="horizontal") + biome_spinner = Spinner(title="Biome level", low=1, high=5, value=1, step=1, width=100, height=50)#, orientation="horizontal") help_text = """ Pie chart with the number of occurrences of the selected taxa in the table by environment (biome) in other microbiome studies analyzed and publicly available at the [MGNify](https://www.ebi.ac.uk/metagenomics) [1] resource. From f56548cebb393e14362b92fbf1b73ab730f59c5c Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Tue, 15 Feb 2022 16:56:48 +0100 Subject: [PATCH 34/50] fix metadata subtable --- grimer/table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grimer/table.py b/grimer/table.py index 11e9874..5740455 100644 --- a/grimer/table.py +++ b/grimer/table.py @@ -55,7 +55,7 @@ def get_subtable(self, rank, samples: list=[], taxids: list=[], keep_shape: bool valid_samples.append(s) if valid_samples: - subtable = subtable.loc[valid_samples] + subtable = subtable.loc[subtable.index.intersection(valid_samples)] if not keep_shape: subtable = subtable.loc[:, subtable.sum(axis=0) > 0] else: From 375143363567c6396e033c4e997ffca419bd4fbe Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 17 Feb 2022 17:06:45 +0100 Subject: [PATCH 35/50] groupby heatmap --- grimer/callbacks.py | 152 +++++++++++++++++++++++++++++--------------- grimer/cds.py | 10 ++- grimer/grimer.py | 3 +- grimer/layout.py | 6 +- grimer/plots.py | 34 +++++++--- grimer/utils.py | 6 +- 6 files changed, 144 insertions(+), 67 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index b913d37..c1cc5d7 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -199,10 +199,13 @@ def link_obstable_samplebars(ele, change_text_legend_obs_callback = CustomJS( args=dict(cds_p_obstable=cds_p_obstable, legend_obs=ele["samplebars"]["legend_obs"], + samplebars=ele["samplebars"]["fig"], active_ranks=active_ranks), code=''' // selected row - var row = cb_obj.indices[0]; + const row = cb_obj.indices[0]; + const selected_rank = cds_p_obstable.data['col|rank'][row]; + for(let r = 0; r < active_ranks.length; r++){ let taxid = cds_p_obstable.data["tax|"+active_ranks[r]][row]; if (taxid){ @@ -210,6 +213,12 @@ def link_obstable_samplebars(ele, }else{ legend_obs.items[r].label = active_ranks[r]; } + // activate only selected rank + if(active_ranks[r]==selected_rank){ + samplebars.renderers[r+3].visible=true; + }else{ + samplebars.renderers[r+3].visible=false; + } } ''') @@ -420,16 +429,17 @@ def link_heatmap_widgets(ele, dict_d_dedro_y, cds_p_annotations, cds_p_obstable, - cds_p_heatmap): + cds_p_heatmap, + active_ranks): x_dendro_callback = CustomJS( args=dict(rank_select=ele["heatmap"]["wid"]["rank_select"], - x_sort_select=ele["heatmap"]["wid"]["x_sort_select"], + x_groupby_select=ele["heatmap"]["wid"]["x_groupby_select"], cds_p_dendro_x=cds_p_dendro_x, dict_d_dedro_x=dict_d_dedro_x), code=''' - if (x_sort_select.value.startsWith("cluster|")){ - const key = rank_select.value+"|"+x_sort_select.value.replace("cluster|",""); + if (x_groupby_select.value.startsWith("cluster|")){ + const key = rank_select.value+"|"+x_groupby_select.value.replace("cluster|",""); cds_p_dendro_x.data = {"x": dict_d_dedro_x[key+"|x"], "y": dict_d_dedro_x[key+"|y"], "c": dict_d_dedro_x[key+"|c"]}; @@ -440,70 +450,109 @@ def link_heatmap_widgets(ele, x_select_callback = CustomJS( args=dict(heatmap=ele["heatmap"]["fig"], + active_ranks=active_ranks, rank_select=ele["heatmap"]["wid"]["rank_select"], x_sort_select=ele["heatmap"]["wid"]["x_sort_select"], + x_groupby_select=ele["heatmap"]["wid"]["x_groupby_select"], dict_d_hcluster_x=dict_d_hcluster_x, cds_p_annotations=cds_p_annotations, - cds_p_obstable=cds_p_obstable), + cds_p_obstable=cds_p_obstable, + cds_p_heatmap=cds_p_heatmap), code=''' + + // selected rank const rank = rank_select.value; + + // get index to access data from observations from cds_p_obstable + var obs_index = []; + for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { + if(cds_p_obstable.data["col|rank"][i]==rank){ + obs_index.push(i); + } + } + var annot_obs = obs_index.map( s => cds_p_obstable.data["index"][s] ); + var sorted_factors = []; - if (x_sort_select.value=="none"){ - // None - sorted_factors = dict_d_hcluster_x["default|" + rank]; - }else if (x_sort_select.value.startsWith("cluster|")){ - // Clustering - // Get sorted elements based on rank|method|metric - const key = rank+"|"+x_sort_select.value.replace("cluster|",""); - sorted_factors = dict_d_hcluster_x[key]; + var dict_factors = {}; + if (x_groupby_select.value.startsWith("cluster|")){ + // Clustering - Get sorted elements based on rank|method|metric + sorted_factors = 
dict_d_hcluster_x[rank+"|"+x_groupby_select.value.replace("cluster|","")]; + // Default dict_factors + for(let i = 0; i < annot_obs.length; i++){ + dict_factors[annot_obs[i]] = annot_obs[i]; + } }else{ - // Sorting - var factors = []; + // Define value from Sort by select var sort_col = []; - if (x_sort_select.value.startsWith("tax|")){ - const group_rank = x_sort_select.value.replace("tax|",""); - for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { - if(cds_p_obstable.data["col|rank"][i]==rank){ - factors.push(cds_p_obstable.data["index"][i]); - sort_col.push(cds_p_obstable.data["tax|" + group_rank][i]); - } - } - sorted_factors = grimer_sort(factors, sort_col, "string", false); - }else if (x_sort_select.value=="counts"){ - for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { - if(cds_p_obstable.data["col|rank"][i]==rank){ - factors.push(cds_p_obstable.data["index"][i]); - sort_col.push(cds_p_obstable.data["col|total_counts"][i]); - } - } - sorted_factors = grimer_sort(factors, sort_col, "numeric", false); + var sort_col_type = "string"; + if (x_sort_select.value=="none"){ + sort_col = dict_d_hcluster_x["default|" + rank]; }else if (x_sort_select.value=="observations"){ - for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { - if(cds_p_obstable.data["col|rank"][i]==rank){ - factors.push(cds_p_obstable.data["index"][i]); - sort_col.push(cds_p_obstable.data["col|name"][i]); - } - } - sorted_factors = grimer_sort(factors, sort_col, "string", false); - }else{ - // copy array of factors - factors = [...dict_d_hcluster_x["default|" + rank]]; + sort_col = obs_index.map( s => cds_p_obstable.data["col|name"][s] ); + }else if (x_sort_select.value=="counts"){ + sort_col = obs_index.map( s => cds_p_obstable.data["col|total_counts"][s] ); + sort_col_type = "numeric"; + }else if (x_sort_select.value.startsWith("annot|")){ const annot = x_sort_select.value.replace("annot|",""); + // create array with zeros, mark with one if annotation is present + sort_col = new Array(annot_obs.length); for (let i=0; i -1) { - sorted_factors.push(factors[index]); // add to the sorted_factors - factors.splice(index, 1); //remove from factors - } + sort_col[annot_obs.indexOf(cds_p_annotations.data["index"][i])] = 1; + } + } + sort_col_type = "numeric"; + } + + if(x_groupby_select.value=="none"){ + sorted_factors = grimer_sort(annot_obs, sort_col, sort_col_type, false); + // Default dict_factors + for(let i = 0; i < annot_obs.length; i++){ + dict_factors[annot_obs[i]] = annot_obs[i]; + } + }else if (x_groupby_select.value.startsWith("tax|")){ + const group_rank = x_groupby_select.value.replace("tax|",""); + + // group entries without selected rank with space " " + var groupby_col = obs_index.map(function(s) { return cds_p_obstable.data["tax|" + group_rank][s] == "" ? 
" " : cds_p_obstable.data["tax|" + group_rank][s]; }); + + // if grouping with a higher rank + if(active_ranks.indexOf(rank) > active_ranks.indexOf(group_rank)){ + for(let i = 0; i < annot_obs.length; i++){ + dict_factors[annot_obs[i]] = [groupby_col[i], annot_obs[i]]; + } + sorted_factors = grimer_sort(Object.values(dict_factors), sort_col, sort_col_type, false, groupby_col); + }else{ + // normal sort + sorted_factors = grimer_sort(annot_obs, sort_col, sort_col_type, false); + // Default dict_factors + for(let i = 0; i < annot_obs.length; i++){ + dict_factors[annot_obs[i]] = annot_obs[i]; } + } - // join annotated and left-overs - sorted_factors = sorted_factors.concat(factors); } } + + // update factors on heatmap col otherwise remove + for (let i = 0; i < cds_p_heatmap.data["index"].length; i++) { + if(cds_p_heatmap.data["rank"][i]==rank){ + cds_p_heatmap.data["factors_obs"][i] = dict_factors[cds_p_heatmap.data["obs"][i]]; + }else{ + cds_p_heatmap.data["factors_obs"][i] = ""; + } + } + + for (let i = 0; i < cds_p_annotations.data["index"].length; i++) { + if(cds_p_annotations.data["rank"][i]==rank){ + cds_p_annotations.data["factors"][i] = dict_factors[cds_p_annotations.data["index"][i]]; + }else{ + cds_p_annotations.data["factors"][i] = ""; + } + } + heatmap.x_range.factors = sorted_factors; + ''') y_dendro_callback = CustomJS( @@ -578,6 +627,7 @@ def link_heatmap_widgets(ele, ele["heatmap"]["wid"]["toggle_labels"].js_on_click(toggle_labels_callback) ele["heatmap"]["wid"]["rank_select"].js_on_change('value', x_select_callback, x_dendro_callback, y_select_callback, y_dendro_callback) ele["heatmap"]["wid"]["x_sort_select"].js_on_change('value', x_select_callback, x_dendro_callback) + ele["heatmap"]["wid"]["x_groupby_select"].js_on_change('value', x_select_callback, x_dendro_callback) ele["heatmap"]["wid"]["y_sort_select"].js_on_change('value', y_select_callback, y_dendro_callback) diff --git a/grimer/cds.py b/grimer/cds.py index aac4257..1288171 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -47,7 +47,7 @@ def generate_cds_annotations(table, references, controls, decontam): # index -> taxids # columns -> rank, annot - df_annotations = pd.DataFrame(columns=["rank", "annot"]) + df_annotations = pd.DataFrame(columns=["rank", "annot", "factors"]) for rank in table.ranks(): # Generate a DataFrame to use as source in tables df_rank = pd.DataFrame(index=table.observations(rank)) @@ -70,6 +70,8 @@ def generate_cds_annotations(table, references, controls, decontam): if "val" in df_rank.columns: df_rank.drop(columns="val", inplace=True) # drop boolean col + df_rank["factors"] = df_rank.index + # Concat in the main df df_annotations = pd.concat([df_annotations, df_rank], axis=0) @@ -308,7 +310,7 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): # ov -> original value (raw counts) # tv -> transformed values (user choice: log10, clr, ...) 
- df_heatmap = pd.DataFrame(columns=["obs", "rank", "ov", "tv"]) + df_heatmap = pd.DataFrame(columns=["obs", "rank", "ov", "tv", "factors_sample", "factors_obs"]) for rank in table.ranks(): stacked_rank_df = pd.DataFrame(table.data[rank].stack(), columns=["ov"]).reset_index(1) # Rename first col to obs @@ -319,6 +321,10 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): #Drop zeros based on original counts if not show_zeros: stacked_rank_df = stacked_rank_df[stacked_rank_df["ov"] > 0] + # initialize factors + stacked_rank_df["factors_sample"] = stacked_rank_df.index + stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] + df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0) df_heatmap.drop('ov', axis=1, inplace=True) diff --git a/grimer/grimer.py b/grimer/grimer.py index 11d3c14..61795f0 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -360,7 +360,8 @@ def main(argv=sys.argv[1:]): dict_d_dedro_y, cds_p_annotations, cds_p_obstable, - cds_p_heatmap) + cds_p_heatmap, + table.ranks()) link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols) diff --git a/grimer/layout.py b/grimer/layout.py index 36b4a8c..58feb55 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -90,8 +90,10 @@ def make_layout(ele, sizes, version, logo_path, title): ele["heatmap"]["wid"]["toggle_labels"], sizing_mode="stretch_height", width=300), - row(ele["heatmap"]["wid"]["x_sort_select"], - ele["heatmap"]["wid"]["y_sort_select"], + row(column(ele["heatmap"]["wid"]["x_sort_select"], + ele["heatmap"]["wid"]["x_groupby_select"]), + column(ele["heatmap"]["wid"]["y_sort_select"], + Spacer()), sizing_mode="stretch_width"), column(ele["metadata"]["wid"]["metadata_multiselect"], ele["metadata"]["wid"]["toggle_legend"], diff --git a/grimer/plots.py b/grimer/plots.py index 2c55ce6..dcde002 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -1,12 +1,12 @@ import markdown # Bokeh -from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LegendItem, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, PrintfTickFormatter, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput +from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, CustomJSTransform, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LegendItem, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, PrintfTickFormatter, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput from bokeh.models.filters import IndexFilter, GroupFilter from bokeh.models.widgets import DataTable, TableColumn from bokeh.palettes import Blues, Dark2, Magma256, Reds from bokeh.plotting import figure -from bokeh.transform import cumsum, factor_cmap +from bokeh.transform import cumsum, factor_cmap, transform from grimer.utils import format_js_toString, make_color_palette @@ -50,7 +50,8 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): name="tax|" + rank, # to work with hover properly source=cds_p_samplebars, marker="circle", size=7, line_color="navy", alpha=0.6, - fill_color=obs_palette[i]) + fill_color=obs_palette[i], + visible=False) legend_obs_items.append((rank, [ren])) # Legend counts (vbars) @@ -672,7 +673,7 @@ def 
plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax color_mapper.low = min(cds_p_heatmap.data["tv"]) color_mapper.high = max(cds_p_heatmap.data["tv"]) - heatmap.rect(x="obs", y="index", width=1, height=1, + heatmap.rect(x="factors_obs", y="factors_sample", width=1, height=1, source=cds_p_heatmap, fill_color={'field': 'tv', 'transform': color_mapper}, line_color=None) @@ -685,6 +686,7 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax return dict_d_taxname[tick]; ''') + heatmap.xaxis.group_label_orientation = "vertical" heatmap.xaxis.major_label_orientation = "vertical" heatmap.xgrid.grid_line_color = None heatmap.ygrid.grid_line_color = None @@ -710,14 +712,17 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name cluster_options.append(("cluster|" + lmethod + "|" + lmetric, lmethod + "/" + lmetric)) x_sort_options = {} - x_sort_options["Clustering Method/Metric"] = cluster_options x_sort_options["Default order"] = [("none", "none"), ("counts", "counts"), ("observations", "observations")] x_sort_options["Sort by References"] = [("annot|" + r, r) for r in reference_names] if controls_names: x_sort_options["Sort by Controls"] = [("annot|" + c, c) for c in controls_names] if decontam: x_sort_options["Sort by DECONTAM"] = [("annot|decontam", "decontam")] - x_sort_options["Sort by taxonomic rank"] = [("tax|" + r, r) for r in ranks] + + x_groupby_options = {} + x_groupby_options["Default"] = [("none", "none")] + x_groupby_options["Clustering Method/Metric"] = cluster_options + x_groupby_options["Taxonomic rank"] = [("tax|" + r, r) for r in ranks] y_sort_options = {} y_sort_options["Clustering Method/Metric"] = cluster_options @@ -730,7 +735,8 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name if categorical_md_data: y_sort_options["Sort by Categorical Metadata"] = [("metadata_cat|" + md, md) for md in categorical_md_data] - x_sort_select = Select(title="Observation cluster/sort:", value="none", options=x_sort_options) + x_sort_select = Select(title="Observation sort:", value="none", options=x_sort_options) + x_groupby_select = Select(title="Observation cluster/group by:", value="none", options=x_groupby_options) y_sort_select = Select(title="Sample cluster/sort:", value="none", options=y_sort_options) toggle_labels = CheckboxGroup(labels=["Show/Hide observations labels", "Show/Hide samples labels"], active=[]) @@ -751,6 +757,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name return {"rank_select": rank_select, "x_sort_select": x_sort_select, + "x_groupby_select": x_groupby_select, "y_sort_select": y_sort_select, "toggle_labels": toggle_labels, "help_button": help_button(title="Heatmap/Clustering", text=help_text)} @@ -847,8 +854,8 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada else: unique_palette = make_color_palette(n) legend_colorbars[md_header].color_mapper = LinearColorMapper(palette=unique_palette, low=0, high=n) - legend_colorbars[md_header].ticker = FixedTicker(ticks=[t+0.5 for t in range(n)]) - legend_colorbars[md_header].major_label_overrides = {i+0.5: unique_values[i] for i in range(n)} + legend_colorbars[md_header].ticker = FixedTicker(ticks=[t + 0.5 for t in range(n)]) + legend_colorbars[md_header].major_label_overrides = {i + 0.5: unique_values[i] for i in range(n)} assert len(unique_palette) == n, 'Wrong number of colors on palette' palette.extend(unique_palette) @@ -925,7 +932,14 @@ def 
plot_annotations(heatmap, tools_heatmap, cds_p_annotations, dict_d_taxname): formatters={"@index": taxid_name_custom} )) - annot_fig.rect(x="index", y="annot", + # trans = CustomJSTransform( + # args=dict(heatmap=heatmap), + # v_func=""" + # console.log(xs); return heatmap.x_range.factors; + # """) + # annot_fig.rect(x=transform(("index","rank"), trans), y="annot", + + annot_fig.rect(x="factors", y="annot", width=1, height=1, source=cds_p_annotations, fill_color="black", line_color=None) diff --git a/grimer/utils.py index abb4d2c..c76fd59 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -142,6 +142,8 @@ def trim_table(table_df): def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): + from grimer.grimer import _debug + # Transpose table (observations as index) and expand ranks in columns ranks_df = table_df.T.index.str.split(level_separator, expand=True).to_frame(index=False) @@ -198,6 +200,8 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list() if invalid: print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) + if _debug: + print_log(",".join(invalid) + " observations removed with invalid lineage at " + r) # Set to NaN to keep shape of ranks_df ranks_df.loc[ranks_df[r].isin(invalid), r] = np.nan @@ -621,7 +625,7 @@ def print_df(df, name: str=None): if isinstance(df, dict): if df: print(len(df.keys()), "keys:", list(df.keys())[0], "...", list(df.keys())[-1]) - # print(list(df.values())[0], "...", list(df.values())[-1]) + print(list(df.values())[0], "...", list(df.values())[-1]) else: #print(df.columns) print(df.head()) From 62e3ea2d28f66f0279adf0b9f6f9bd29bcae369c Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Fri, 18 Feb 2022 16:21:33 +0100 Subject: [PATCH 36/50] heatmap with grouping in both axes --- grimer/callbacks.py | 140 ++++++++++++++++++++++++++++++++------------ grimer/cds.py | 19 +++--- grimer/grimer.py | 6 +- grimer/js/func.js | 2 +- grimer/layout.py | 11 ++-- grimer/plots.py | 65 +++++++++++++------- grimer/utils.py | 4 +- 7 files changed, 168 insertions(+), 79 deletions(-) diff --git a/grimer/callbacks.py index c1cc5d7..5d2e723 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -421,6 +421,7 @@ def link_obstable_samplebars(ele, def link_heatmap_widgets(ele, cds_d_samples, cds_d_metadata, + cds_p_metadata, dict_d_hcluster_x, dict_d_hcluster_y, cds_p_dendro_x, @@ -430,11 +431,14 @@ def link_heatmap_widgets(ele, cds_p_annotations, cds_p_obstable, cds_p_heatmap, - active_ranks): + active_ranks, + dict_d_taxname): x_dendro_callback = CustomJS( args=dict(rank_select=ele["heatmap"]["wid"]["rank_select"], x_groupby_select=ele["heatmap"]["wid"]["x_groupby_select"], + x_sort_select=ele["heatmap"]["wid"]["x_sort_select"], + dendrox=ele["dendrox"]["fig"], cds_p_dendro_x=cds_p_dendro_x, dict_d_dedro_x=dict_d_dedro_x), code=''' @@ -443,8 +447,13 @@ def link_heatmap_widgets(ele, cds_p_dendro_x.data = {"x": dict_d_dedro_x[key+"|x"], "y": dict_d_dedro_x[key+"|y"], "c": dict_d_dedro_x[key+"|c"]}; + x_sort_select.value="none"; + x_sort_select.disabled=true; + dendrox.visible=true; }else{ cds_p_dendro_x.data = {"x": [], "y": [], "c": []}; + x_sort_select.disabled=false; + dendrox.visible=false; } ''') x_select_callback = CustomJS( args=dict(heatmap=ele["heatmap"]["fig"], active_ranks=active_ranks, rank_select=ele["heatmap"]["wid"]["rank_select"], x_sort_select=ele["heatmap"]["wid"]["x_sort_select"], x_groupby_select=ele["heatmap"]["wid"]["x_groupby_select"], dict_d_hcluster_x=dict_d_hcluster_x, cds_p_annotations=cds_p_annotations, cds_p_obstable=cds_p_obstable, - cds_p_heatmap=cds_p_heatmap), + 
cds_p_heatmap=cds_p_heatmap, + dict_d_taxname=dict_d_taxname), code=''' - // selected rank const rank = rank_select.value; @@ -470,28 +479,29 @@ def link_heatmap_widgets(ele, obs_index.push(i); } } + var annot_obs = obs_index.map( s => cds_p_obstable.data["index"][s] ); var sorted_factors = []; var dict_factors = {}; if (x_groupby_select.value.startsWith("cluster|")){ - // Clustering - Get sorted elements based on rank|method|metric - sorted_factors = dict_d_hcluster_x[rank+"|"+x_groupby_select.value.replace("cluster|","")]; // Default dict_factors for(let i = 0; i < annot_obs.length; i++){ dict_factors[annot_obs[i]] = annot_obs[i]; } + // Clustering - Get sorted elements based on rank|method|metric + sorted_factors = dict_d_hcluster_x[rank+"|"+x_groupby_select.value.replace("cluster|","")]; }else{ // Define value from Sort by select var sort_col = []; - var sort_col_type = "string"; + var sort_col_type = "numeric"; if (x_sort_select.value=="none"){ - sort_col = dict_d_hcluster_x["default|" + rank]; + sort_col = obs_index; }else if (x_sort_select.value=="observations"){ sort_col = obs_index.map( s => cds_p_obstable.data["col|name"][s] ); + sort_col_type = "string"; }else if (x_sort_select.value=="counts"){ sort_col = obs_index.map( s => cds_p_obstable.data["col|total_counts"][s] ); - sort_col_type = "numeric"; }else if (x_sort_select.value.startsWith("annot|")){ const annot = x_sort_select.value.replace("annot|",""); // create array with zeros, mark with one if annotation is present @@ -501,35 +511,33 @@ def link_heatmap_widgets(ele, sort_col[annot_obs.indexOf(cds_p_annotations.data["index"][i])] = 1; } } - sort_col_type = "numeric"; } if(x_groupby_select.value=="none"){ - sorted_factors = grimer_sort(annot_obs, sort_col, sort_col_type, false); // Default dict_factors for(let i = 0; i < annot_obs.length; i++){ dict_factors[annot_obs[i]] = annot_obs[i]; } + sorted_factors = grimer_sort(annot_obs, sort_col, sort_col_type, false); }else if (x_groupby_select.value.startsWith("tax|")){ const group_rank = x_groupby_select.value.replace("tax|",""); - - // group entries without selected rank with space " " - var groupby_col = obs_index.map(function(s) { return cds_p_obstable.data["tax|" + group_rank][s] == "" ? " " : cds_p_obstable.data["tax|" + group_rank][s]; }); - // if grouping with a higher rank if(active_ranks.indexOf(rank) > active_ranks.indexOf(group_rank)){ + // group entries without selected rank with space " " + var groupby_col = obs_index.map(function(s) { return cds_p_obstable.data["tax|" + group_rank][s] == "" ? 
" " : dict_d_taxname[cds_p_obstable.data["tax|" + group_rank][s]]; }); + var factors = []; for(let i = 0; i < annot_obs.length; i++){ dict_factors[annot_obs[i]] = [groupby_col[i], annot_obs[i]]; + factors.push([groupby_col[i], annot_obs[i]]); } - sorted_factors = grimer_sort(Object.values(dict_factors), sort_col, sort_col_type, false, groupby_col); + sorted_factors = grimer_sort(factors, sort_col, sort_col_type, false, groupby_col); }else{ - // normal sort - sorted_factors = grimer_sort(annot_obs, sort_col, sort_col_type, false); // Default dict_factors for(let i = 0; i < annot_obs.length; i++){ dict_factors[annot_obs[i]] = annot_obs[i]; } - + // normal sort + sorted_factors = grimer_sort(annot_obs, sort_col, sort_col_type, false); } } } @@ -552,22 +560,28 @@ def link_heatmap_widgets(ele, } heatmap.x_range.factors = sorted_factors; - ''') y_dendro_callback = CustomJS( args=dict(rank_select=ele["heatmap"]["wid"]["rank_select"], + y_groupby_select=ele["heatmap"]["wid"]["y_groupby_select"], y_sort_select=ele["heatmap"]["wid"]["y_sort_select"], + dendroy=ele["dendroy"]["fig"], cds_p_dendro_y=cds_p_dendro_y, dict_d_dedro_y=dict_d_dedro_y), code=''' - if (y_sort_select.value.startsWith("cluster|")){ - const key = rank_select.value+"|"+y_sort_select.value.replace("cluster|",""); + if (y_groupby_select.value.startsWith("cluster|")){ + const key = rank_select.value+"|"+y_groupby_select.value.replace("cluster|",""); cds_p_dendro_y.data = {"x": dict_d_dedro_y[key+"|x"], - "y": dict_d_dedro_y[key+"|y"], - "c": dict_d_dedro_y[key+"|c"]}; + "y": dict_d_dedro_y[key+"|y"], + "c": dict_d_dedro_y[key+"|c"]}; + y_sort_select.value="none"; + y_sort_select.disabled=true; + dendroy.visible=true; }else{ cds_p_dendro_y.data = {"x": [], "y": [], "c": []}; + y_sort_select.disabled=false; + dendroy.visible=false; } ''') @@ -575,31 +589,78 @@ def link_heatmap_widgets(ele, args=dict(heatmap=ele["heatmap"]["fig"], cds_d_samples=cds_d_samples, cds_d_metadata=cds_d_metadata, + cds_p_metadata=cds_p_metadata, + cds_p_heatmap=cds_p_heatmap, rank_select=ele["heatmap"]["wid"]["rank_select"], y_sort_select=ele["heatmap"]["wid"]["y_sort_select"], + y_groupby_select=ele["heatmap"]["wid"]["y_groupby_select"], dict_d_hcluster_y=dict_d_hcluster_y), code=''' + // selected rank + const rank = rank_select.value; + var annot_samples = cds_d_samples.data["index"]; + var sorted_factors = []; - if (y_sort_select.value=="none"){ - // None - sorted_factors = dict_d_hcluster_y["default"]; - }else if (y_sort_select.value.startsWith("cluster|")){ - // Clustering - // Get sorted elements based on rank|method|metric - const key = rank_select.value+"|"+y_sort_select.value.replace("cluster|",""); - sorted_factors = dict_d_hcluster_y[key]; + var dict_factors = {}; + if (y_groupby_select.value.startsWith("cluster|")){ + // Clustering - Get sorted elements based on rank|method|metric + sorted_factors = dict_d_hcluster_y[rank+"|"+y_groupby_select.value.replace("cluster|","")]; + // Default dict_factors + for(let i = 0; i < annot_samples.length; i++){ + dict_factors[annot_samples[i]] = annot_samples[i]; + } }else{ - // Sorting - if (y_sort_select.value=="counts"){ - sorted_factors = grimer_sort(cds_d_samples.data["index"], cds_d_samples.data["cnt|total"], "numeric", false); + // Define value from Sort by select + var sort_col = []; + var sort_col_type = "string"; + if (y_sort_select.value=="none"){ + sort_col = dict_d_hcluster_y["default"]; }else if (y_sort_select.value=="samples"){ - sorted_factors = grimer_sort(cds_d_samples.data["index"], 
cds_d_samples.data["index"], "string", false); - }else if (y_sort_select.value.startsWith("metadata_cat|")){ - sorted_factors = grimer_sort(cds_d_samples.data["index"], cds_d_metadata.data[y_sort_select.value.replace("metadata_cat|","")], "string", false); + sort_col = annot_samples; + }else if (y_sort_select.value=="counts"){ + sort_col = cds_d_samples.data["cnt|total"]; + sort_col_type = "numeric"; }else if (y_sort_select.value.startsWith("metadata_num|")){ - sorted_factors = grimer_sort(cds_d_samples.data["index"], cds_d_metadata.data[y_sort_select.value.replace("metadata_num|","")], "numeric", false); + sort_col = cds_d_metadata.data[y_sort_select.value.replace("metadata_num|","")]; + sort_col_type = "numeric"; + }else if (y_sort_select.value.startsWith("metadata_cat|")){ + sort_col = cds_d_metadata.data[y_sort_select.value.replace("metadata_cat|","")]; + } + + if(y_groupby_select.value=="none"){ + sorted_factors = grimer_sort(annot_samples, sort_col, sort_col_type, false); + // Default dict_factors + for(let i = 0; i < annot_samples.length; i++){ + dict_factors[annot_samples[i]] = annot_samples[i]; + } + }else if (y_groupby_select.value.startsWith("group_metadata|")){ + const group_metadata = y_groupby_select.value.replace("group_metadata|",""); + + // group entries without metadata with space " " + var groupby_col = cds_d_metadata.data[group_metadata]; + + for(let i = 0; i < annot_samples.length; i++){ + dict_factors[annot_samples[i]] = [groupby_col[i], annot_samples[i]]; + } + sorted_factors = grimer_sort(Object.values(dict_factors), sort_col, sort_col_type, false, groupby_col); + } } + + + // update factors on heatmap col otherwise remove + for (let i = 0; i < cds_p_heatmap.data["index"].length; i++) { + if(cds_p_heatmap.data["rank"][i]==rank){ + cds_p_heatmap.data["factors_sample"][i] = dict_factors[cds_p_heatmap.data["index"][i]]; + }else{ + cds_p_heatmap.data["factors_sample"][i] = ""; + } + } + + for (let i = 0; i < cds_p_metadata.data["index"].length; i++) { + cds_p_metadata.data["factors"][i] = dict_factors[cds_p_metadata.data["index"][i]]; + } + heatmap.y_range.factors = sorted_factors; ''') @@ -625,10 +686,11 @@ def link_heatmap_widgets(ele, ''') ele["heatmap"]["wid"]["toggle_labels"].js_on_click(toggle_labels_callback) - ele["heatmap"]["wid"]["rank_select"].js_on_change('value', x_select_callback, x_dendro_callback, y_select_callback, y_dendro_callback) + ele["heatmap"]["wid"]["rank_select"].js_on_change('value', y_select_callback, x_select_callback, x_dendro_callback, y_dendro_callback) ele["heatmap"]["wid"]["x_sort_select"].js_on_change('value', x_select_callback, x_dendro_callback) ele["heatmap"]["wid"]["x_groupby_select"].js_on_change('value', x_select_callback, x_dendro_callback) ele["heatmap"]["wid"]["y_sort_select"].js_on_change('value', y_select_callback, y_dendro_callback) + ele["heatmap"]["wid"]["y_groupby_select"].js_on_change('value', y_select_callback, y_dendro_callback) def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols): diff --git a/grimer/cds.py b/grimer/cds.py index 1288171..8774a1d 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -48,7 +48,7 @@ def generate_cds_annotations(table, references, controls, decontam): # columns -> rank, annot df_annotations = pd.DataFrame(columns=["rank", "annot", "factors"]) - for rank in table.ranks(): + for i,rank in enumerate(table.ranks()): # Generate a DataFrame to use as source in tables df_rank = pd.DataFrame(index=table.observations(rank)) @@ -70,7 +70,7 @@ def 
generate_cds_annotations(table, references, controls, decontam): if "val" in df_rank.columns: df_rank.drop(columns="val", inplace=True) # drop boolean col - df_rank["factors"] = df_rank.index + df_rank["factors"] = df_rank.index if i == 0 else "" # Concat in the main df df_annotations = pd.concat([df_annotations, df_rank], axis=0) @@ -236,10 +236,13 @@ def generate_cds_plot_metadata(metadata, max_metadata_cols): # md0, md1, ..., md(max_metadata_cols) # values (metadata field, metadata values) - df_plot_md = pd.DataFrame(index=metadata.data.index, columns=[str(i) for i in range(1, max_metadata_cols + 1)]) + df_plot_md = pd.DataFrame(index=metadata.data.index, columns=["factors"] + [str(i) for i in range(1, max_metadata_cols + 1)]) + df_plot_md["factors"] = df_plot_md.index # Fill in only first metadata field first_field = metadata.get_col_headers()[0] + df_plot_md["1"] = [(first_field, format_js_toString(md_value)) for md_value in metadata.get_col(first_field)] + print_df(df_plot_md, "cds_p_metadata") return ColumnDataSource(df_plot_md) @@ -311,7 +314,7 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): # tv -> transformed values (user choice: log10, clr, ...) df_heatmap = pd.DataFrame(columns=["obs", "rank", "ov", "tv", "factors_sample", "factors_obs"]) - for rank in table.ranks(): + for i, rank in enumerate(table.ranks()): stacked_rank_df = pd.DataFrame(table.data[rank].stack(), columns=["ov"]).reset_index(1) # Rename first col to obs stacked_rank_df.rename(columns={stacked_rank_df.columns[0]: "obs"}, inplace=True) @@ -321,9 +324,11 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): #Drop zeros based on original counts if not show_zeros: stacked_rank_df = stacked_rank_df[stacked_rank_df["ov"] > 0] - # initialize factors - stacked_rank_df["factors_sample"] = stacked_rank_df.index - stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] + # initialize factors only for first rank + #stacked_rank_df["factors_sample"] = stacked_rank_df.index + #stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] + stacked_rank_df["factors_sample"] = stacked_rank_df.index if i==0 else "" + stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] if i==0 else "" df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0) diff --git a/grimer/grimer.py b/grimer/grimer.py index 61795f0..806105b 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -55,7 +55,7 @@ def main(argv=sys.argv[1:]): elif args.tax == "ott": tax = OttTx(files=args.tax_files, extended_names=True) else: - print_log(" - No taxonomy set") + print_log("- No taxonomy set") print_log("") # Table of counts @@ -352,6 +352,7 @@ def main(argv=sys.argv[1:]): link_heatmap_widgets(ele, cds_d_samples, cds_d_metadata, + cds_p_metadata, dict_d_hcluster_x, dict_d_hcluster_y, cds_p_dendro_x, @@ -361,7 +362,8 @@ def main(argv=sys.argv[1:]): cds_p_annotations, cds_p_obstable, cds_p_heatmap, - table.ranks()) + table.ranks(), + dict_d_taxname) link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols) diff --git a/grimer/js/func.js b/grimer/js/func.js index bab3693..504f1bd 100644 --- a/grimer/js/func.js +++ b/grimer/js/func.js @@ -50,7 +50,7 @@ function grimer_sort(factors, sort_col, sort_mode="numeric", desc=false, group_c else if (sort_mode=="string" && desc==true) idx.sort((a, b) => sort_string(sort_col[b],sort_col[a])); } - + var sorted_factors = new Array(idx.length); for (var i = 0; i < idx.length; ++i) sorted_factors[i] = factors[idx[i]]; return sorted_factors; diff 
--git a/grimer/layout.py b/grimer/layout.py index 58feb55..4428f72 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -82,18 +82,17 @@ def make_layout(ele, sizes, version, logo_path, title): row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]], [ele["dendrox"]["fig"]], - [ele["annotations"]["fig"], ele["heatmap"]["wid"]["help_button"]]], + [ele["annotations"]["fig"], None, ele["heatmap"]["wid"]["help_button"]]], toolbar_location='right', merge_tools=True) row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"], ele["heatmap"]["wid"]["toggle_labels"], - sizing_mode="stretch_height", width=300), - row(column(ele["heatmap"]["wid"]["x_sort_select"], - ele["heatmap"]["wid"]["x_groupby_select"]), - column(ele["heatmap"]["wid"]["y_sort_select"], - Spacer()), + row(column(ele["heatmap"]["wid"]["x_groupby_select"], + ele["heatmap"]["wid"]["x_sort_select"]), + column(ele["heatmap"]["wid"]["y_groupby_select"], + ele["heatmap"]["wid"]["y_sort_select"]), sizing_mode="stretch_width"), column(ele["metadata"]["wid"]["metadata_multiselect"], ele["metadata"]["wid"]["toggle_legend"], diff --git a/grimer/plots.py b/grimer/plots.py index dcde002..24f7a28 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -678,8 +678,16 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax fill_color={'field': 'tv', 'transform': color_mapper}, line_color=None) - color_bar = ColorBar(color_mapper=color_mapper, label_standoff=6, height=10, border_line_color=None, location="center", orientation="horizontal") - heatmap.add_layout(color_bar, 'above') + color_bar = ColorBar(color_mapper=color_mapper, + label_standoff=2, + width=6, + height=200, + border_line_color=None, + location="center", + orientation="vertical", + major_label_text_align="left", + major_label_text_font_size="9px") + heatmap.add_layout(color_bar, 'left') # Convert taxid ticks to taxa names on client-side heatmap.xaxis.formatter = FuncTickFormatter(args=dict(dict_d_taxname=dict_d_taxname), code=''' @@ -687,6 +695,7 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax ''') heatmap.xaxis.group_label_orientation = "vertical" + heatmap.yaxis.group_label_orientation = "horizontal" heatmap.xaxis.major_label_orientation = "vertical" heatmap.xgrid.grid_line_color = None heatmap.ygrid.grid_line_color = None @@ -711,33 +720,40 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name for lmethod in linkage_methods: cluster_options.append(("cluster|" + lmethod + "|" + lmetric, lmethod + "/" + lmetric)) + x_groupby_options = {} + x_groupby_options["Default"] = [("none", "none")] + x_groupby_options["Clustering Method/Metric"] = cluster_options + x_groupby_options["Group by taxonomic rank"] = [("tax|" + r, r) for r in ranks] + x_sort_options = {} - x_sort_options["Default order"] = [("none", "none"), ("counts", "counts"), ("observations", "observations")] - x_sort_options["Sort by References"] = [("annot|" + r, r) for r in reference_names] + x_sort_options["Default"] = [("none", "none"), ("counts", "counts"), ("observations", "observations")] + x_sort_options["References"] = [("annot|" + r, r) for r in reference_names] if controls_names: - x_sort_options["Sort by Controls"] = [("annot|" + c, c) for c in controls_names] + x_sort_options["Controls"] = [("annot|" + c, c) for c in controls_names] if decontam: - x_sort_options["Sort by DECONTAM"] = [("annot|decontam", "decontam")] + x_sort_options["DECONTAM"] = [("annot|decontam", 
"decontam")] - x_groupby_options = {} - x_groupby_options["Default"] = [("none", "none")] - x_groupby_options["Clustering Method/Metric"] = cluster_options - x_groupby_options["Taxonomic rank"] = [("tax|" + r, r) for r in ranks] + y_groupby_options = {} + y_groupby_options["Default"] = [("none", "none")] + y_groupby_options["Clustering Method/Metric"] = cluster_options + categorical_md_data = metadata.get_data(metadata_type="categorical").columns.to_list() + if categorical_md_data: + y_groupby_options["Group by Categorical Metadata"] = [("group_metadata|" + md, md) for md in categorical_md_data] y_sort_options = {} - y_sort_options["Clustering Method/Metric"] = cluster_options - y_sort_options["Default order"] = [("none", "none"), ("counts", "counts"), ("samples", "samples")] + y_sort_options["Default"] = [("none", "none"), ("counts", "counts"), ("samples", "samples")] if metadata: numeric_md_data = metadata.get_data(metadata_type="numeric").columns.to_list() if numeric_md_data: - y_sort_options["Sort by Numeric Metadata"] = [("metadata_num|" + md, md) for md in numeric_md_data] + y_sort_options["Numeric Metadata"] = [("metadata_num|" + md, md) for md in numeric_md_data] categorical_md_data = metadata.get_data(metadata_type="categorical").columns.to_list() if categorical_md_data: - y_sort_options["Sort by Categorical Metadata"] = [("metadata_cat|" + md, md) for md in categorical_md_data] + y_sort_options["Categorical Metadata"] = [("metadata_cat|" + md, md) for md in categorical_md_data] - x_sort_select = Select(title="Observation sort:", value="none", options=x_sort_options) x_groupby_select = Select(title="Observation cluster/group by:", value="none", options=x_groupby_options) - y_sort_select = Select(title="Sample cluster/sort:", value="none", options=y_sort_options) + x_sort_select = Select(title="Observation sort:", value="none", options=x_sort_options) + y_groupby_select = Select(title="Sample cluster/group by:", value="none", options=y_groupby_options) + y_sort_select = Select(title="Sample sort:", value="none", options=y_sort_options) toggle_labels = CheckboxGroup(labels=["Show/Hide observations labels", "Show/Hide samples labels"], active=[]) @@ -756,8 +772,9 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name """ return {"rank_select": rank_select, - "x_sort_select": x_sort_select, "x_groupby_select": x_groupby_select, + "x_sort_select": x_sort_select, + "y_groupby_select": y_groupby_select, "y_sort_select": y_sort_select, "toggle_labels": toggle_labels, "help_button": help_button(title="Heatmap/Clustering", text=help_text)} @@ -769,12 +786,14 @@ def plot_dendrogram(heatmap, tools_heatmap, cds_p_dendro_x, cds_p_dendro_y): tools="save", height=80, sizing_mode="stretch_width", - tooltips=[("y", "$y{(0.00)}"), ("c", "$swatch:c")]) + #tooltips=[("y", "$y{(0.00)}"), ("c", "$swatch:c")], + visible=False) dendroy_fig = figure(y_range=heatmap.y_range, tools="save", width=80, height=heatmap.height, - tooltips=[("x", "$x{(0.00)}"), ("c", "$swatch:c")]) + #tooltips=[("x", "$x{(0.00)}"), ("c", "$swatch:c")], + visible=False) dendroy_fig.multi_line(xs="x", ys="y", color="c", source=cds_p_dendro_y) @@ -808,7 +827,7 @@ def plot_dendrogram(heatmap, tools_heatmap, cds_p_dendro_x, cds_p_dendro_y): def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metadata): # Get fixed headers from cds - cols = list(cds_p_metadata.data.keys())[1:] + cols = list(cds_p_metadata.data.keys())[2:] metadata_fig = figure(x_range=cols, y_range=heatmap.y_range, @@ 
-817,7 +836,6 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada width=300, height=heatmap.height, tooltips="") - metadata_fig.xaxis.major_label_orientation = 0.7 metadata_fields = metadata.get_col_headers().to_list() @@ -873,7 +891,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada metadata_fig.add_tools(HoverTool(tooltips=tooltips, formatters=formatters)) for col in cols: - metadata_fig.rect(x={"value": col}, y="index", + metadata_fig.rect(x={"value": col}, y="factors", width=1, height=1, source=cds_p_metadata, fill_color={'field': col, 'transform': metadata_colormap}, @@ -891,11 +909,13 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada metadata_fig.xaxis.major_label_orientation = "vertical" metadata_fig.xaxis.major_label_text_font_size = "11px" metadata_fig.xaxis.minor_tick_line_color = None + metadata_fig.xgrid.grid_line_color = None metadata_fig.yaxis.major_tick_line_color = None metadata_fig.yaxis.minor_tick_line_color = None metadata_fig.yaxis.major_label_text_font_size = '0pt' metadata_fig.yaxis.axis_line_color = None + metadata_fig.yaxis.group_text_font_size = "0px" metadata_fig.ygrid.grid_line_color = None metadata_multiselect = MultiSelect(title="Metadata to show (select max. " + str(len(cols)) + " columns):", value=[metadata_fields[0]], options=metadata_fields) @@ -952,6 +972,7 @@ def plot_annotations(heatmap, tools_heatmap, cds_p_annotations, dict_d_taxname): annot_fig.yaxis.minor_tick_line_color = None annot_fig.xaxis.major_tick_line_color = None annot_fig.xaxis.major_label_text_font_size = "0px" + annot_fig.xaxis.group_text_font_size = "0px" return annot_fig diff --git a/grimer/utils.py b/grimer/utils.py index c76fd59..844662a 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -185,7 +185,7 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): # Add nan to keep missing ranks (different than tax.undefined_node [None] which will keep the name) updated_nodes[np.nan] = np.nan - ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] != np.nan else t) + ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] is not None else t) del updated_nodes[np.nan] unmatched_nodes += list(updated_nodes.values()).count(tax.undefined_node) @@ -625,7 +625,7 @@ def print_df(df, name: str=None): if isinstance(df, dict): if df: print(len(df.keys()), "keys:", list(df.keys())[0], "...", list(df.keys())[-1]) - print(list(df.values())[0], "...", list(df.values())[-1]) + #print(list(df.values())[0], "...", list(df.values())[-1]) else: #print(df.columns) print(df.head()) From d5959d4640ae2d8926ad0e18ef0cc25c2c70d712 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 18 Feb 2022 18:09:07 +0100 Subject: [PATCH 37/50] scaled annotations --- grimer/callbacks.py | 4 ++-- grimer/cds.py | 43 +++++++++++++++++++++++++++++++------------ grimer/decontam.py | 1 + grimer/grimer.py | 2 +- grimer/plots.py | 27 +++++++++++++++++++-------- 5 files changed, 54 insertions(+), 23 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 5d2e723..ab514c1 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -505,10 +505,10 @@ def link_heatmap_widgets(ele, }else if (x_sort_select.value.startsWith("annot|")){ const annot = x_sort_select.value.replace("annot|",""); // create array with zeros, mark with one if annotation is present - sort_col = new Array(annot_obs.length); for (let i=0; i taxids # columns -> rank, annot - df_annotations = pd.DataFrame(columns=["rank", "annot", "factors"]) - for i,rank in enumerate(table.ranks()): + df_annotations = pd.DataFrame(columns=["rank", "annot", "factors", "ov", "tv"]) + for i, rank in enumerate(table.ranks()): # Generate a DataFrame to use as source in tables df_rank = pd.DataFrame(index=table.observations(rank)) if decontam: - df_rank["decontam"] = decontam.get_contaminants(rank, df_rank.index) + df_rank["decontam"] = decontam.get_pvalues(rank, df_rank.index)[decontam.get_contaminants(rank, df_rank.index).values] for desc, ref in references.items(): - df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) >= 1 + df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) + df_rank.loc[df_rank[desc] == 0, desc] = np.nan if controls: for desc, ctrl in controls.items(): - df_rank[desc] = table.observations(rank).map(lambda x: ctrl.get_refs_count(x, direct=True)) >= 1 + control_table = table.get_subtable(samples=control_samples[desc], rank=rank) + freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0] + df_rank[desc] = table.observations(rank).map(freq_perc_control).to_list() - df_rank = pd.DataFrame(df_rank.stack(), columns=["val"]).reset_index(1) + df_rank = pd.DataFrame(df_rank.stack(), columns=["ov"]).reset_index(1) df_rank.rename(columns={"level_1": "annot"}, inplace=True) - df_rank = df_rank[df_rank["val"]] # keep only true entries - df_rank["rank"] = rank # set rank - if "val" in df_rank.columns: - df_rank.drop(columns="val", inplace=True) # drop boolean col + # add transformed values to fit same scale on heatmap + # Decontam reverse p-score normalized + if not df_rank[df_rank["annot"] == "decontam"].empty: + min_val = df_rank[df_rank["annot"] == "decontam"]["ov"].min() + max_val = df_rank[df_rank["annot"] == "decontam"]["ov"].max() + df_rank.loc[df_rank["annot"] == "decontam", "tv"] = 1 - ((df_rank[df_rank["annot"] == "decontam"]["ov"] - min_val) / (max_val - min_val)) - df_rank["factors"] = df_rank.index if i == 0 else "" + # max references divided by max + for desc, ref in references.items(): + if not df_rank[df_rank["annot"] == desc].empty: + max_val = df_rank[df_rank["annot"] == desc]["ov"].max() + df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] / max_val + + # keep same percentage + if controls: + for desc, ctrl in controls.items(): + if not df_rank.loc[df_rank["annot"] == desc].empty: + df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] + + df_rank["rank"] = rank # set rank + df_rank["factors"] = df_rank.index if i == 0 else "" # initialize just for first rank (save space) # Concat in the main df df_annotations = 
pd.concat([df_annotations, df_rank], axis=0) + print_df(df_annotations, "cds_p_annotations") return ColumnDataSource(df_annotations) diff --git a/grimer/decontam.py b/grimer/decontam.py index 7cd6fc9..867783a 100644 --- a/grimer/decontam.py +++ b/grimer/decontam.py @@ -29,6 +29,7 @@ def add_rank_results(self, rank, decontam_out_file, decontam_mod_file): def add_rank_empty(self, rank, idx): self.rank[rank] = pd.DataFrame(index=idx, columns=self.cols_rank + ["contam", "contam_2", "non.contam"]) + self.rank[rank]["contaminant"] = False def get_data(self): return self.data.fillna(False) diff --git a/grimer/grimer.py b/grimer/grimer.py index 806105b..3ab135d 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -198,7 +198,7 @@ def main(argv=sys.argv[1:]): # matrix: index (unique sample-ids), md0, md1, ..., md(max_metadata_cols) -> (metadata field, metadata values) cds_p_metadata = generate_cds_plot_metadata(metadata, max_metadata_cols) if metadata else None # stacked: index (repeated observations), rank, annot - cds_p_annotations = generate_cds_annotations(table, references, controls, decontam) + cds_p_annotations = generate_cds_annotations(table, references, controls, decontam, control_samples) # empty matrix {"x": [], "y": [], "c": []} cds_p_dendro_x, cds_p_dendro_y = generate_cds_plot_dendro() if not args.skip_dendrogram else [None, None] # stacked: index (repeated observations), other observation, rank, rho diff --git a/grimer/plots.py b/grimer/plots.py index 24f7a28..71a8bbc 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -948,21 +948,32 @@ def plot_annotations(heatmap, tools_heatmap, cds_p_annotations, dict_d_taxname): ) # Add custom tooltip for heatmap (taxid->name) annot_fig.add_tools(HoverTool( - tooltips=[('Annotation', '@annot'), ('Observation', '@index{custom}')], + tooltips=[('Annotation', '@annot'), + ('Observation', '@index{custom}'), + ('Original Value', '@ov'), + ('Transformed Value', '@tv')], formatters={"@index": taxid_name_custom} )) - # trans = CustomJSTransform( - # args=dict(heatmap=heatmap), - # v_func=""" - # console.log(xs); return heatmap.x_range.factors; - # """) - # annot_fig.rect(x=transform(("index","rank"), trans), y="annot", + color_palette = Magma256[::-1] + color_mapper = LinearColorMapper(palette=color_palette, low=0, high=1) + + color_bar = ColorBar(color_mapper=color_mapper, + label_standoff=2, + width=6, + height=60, + border_line_color=None, + location="center", + orientation="vertical", + major_label_text_align="left", + major_label_text_font_size="9px") + annot_fig.add_layout(color_bar, 'left') annot_fig.rect(x="factors", y="annot", width=1, height=1, source=cds_p_annotations, - fill_color="black", + #fill_color="black", + fill_color={'field': 'tv', 'transform': color_mapper}, line_color=None) annot_fig.yaxis.axis_label = "annotations" From aab425c7eebf12917b8e404e979e774eaf3803ad Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Tue, 22 Feb 2022 19:09:11 +0100 Subject: [PATCH 38/50] reverse grouping, bug fixes --- grimer-mgnify.py | 7 +++++-- grimer/callbacks.py | 8 ++++---- grimer/cds.py | 5 ++++- grimer/css/popup.css | 2 +- grimer/js/popup.js | 2 +- grimer/layout.py | 4 ++++ grimer/plots.py | 2 +- 7 files changed, 20 insertions(+), 10 deletions(-) diff --git a/grimer-mgnify.py b/grimer-mgnify.py index 9704a71..1bfa519 100755 --- a/grimer-mgnify.py +++ b/grimer-mgnify.py @@ -26,6 +26,10 @@ files = sorted(files, key=lambda x: os.stat(x).st_size) md = glob.glob(prefix + '*_metadata.tsv*') +if args.grimer_params: + grimer_params = args.grimer_params.split(" ") +else: + grimer_params = [] grimer.grimer.main(["-i", files[-1], "-m", md[-1], "-c", 'config/default.yaml', @@ -35,5 +39,4 @@ "-t", "ncbi", "-o", prefix + ".html", "--title", "MGnify study accession " + args.mgnify_study_accession, - *args.grimer_params.split(" ") - ]) + ] + grimer_params) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index ab514c1..9fac2a4 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -114,10 +114,10 @@ def link_obstable_samplebars(ele, var groupby_col2 = cds_d_metadata.data[groupby2_select.value.replace('metadata_cat|','')]; factors = groupby_col2.map(function(m, i) { - return [m, groupby_col1[i], annot_samples[i]]; + return [groupby_col1[i], m, annot_samples[i]]; }); - sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, groupby_col2); + sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col2, groupby_col1); }else{ sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1); } @@ -935,11 +935,11 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cd var groupby_col2 = cds_d_metadata.data[groupby2_select.value.replace('metadata_cat|','')]; factors = groupby_col2.map(function(m, i) { - return [m, groupby_col1[i], annot_samples[i]]; + return [groupby_col1[i], m, annot_samples[i]]; }); // only selected_indices - sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, groupby_col2, selected_indices); + sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col2, groupby_col1, selected_indices); }else{ sorted_factors = grimer_sort(factors, sort_col, "numeric", false, groupby_col1, [], selected_indices); } diff --git a/grimer/cds.py b/grimer/cds.py index ef1e66d..3ff52f5 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -53,7 +53,10 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp df_rank = pd.DataFrame(index=table.observations(rank)) if decontam: - df_rank["decontam"] = decontam.get_pvalues(rank, df_rank.index)[decontam.get_contaminants(rank, df_rank.index).values] + contaminants = decontam.get_contaminants(rank, df_rank.index).values + print(contaminants) + if contaminants.any(): + df_rank["decontam"] = decontam.get_pvalues(rank, df_rank.index)[contaminants] for desc, ref in references.items(): df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) diff --git a/grimer/css/popup.css b/grimer/css/popup.css index 6656aa3..d0262a2 100644 --- a/grimer/css/popup.css +++ b/grimer/css/popup.css @@ -49,4 +49,4 @@ color: #fff; font-size: 32px; cursor: pointer; -} \ No newline at end of file +} diff --git a/grimer/js/popup.js b/grimer/js/popup.js index 62f9525..24612bd 100644 --- a/grimer/js/popup.js +++ b/grimer/js/popup.js @@ -46,4 +46,4 @@ var pop = { pop.pWrap.classList.remove("open"); } }; 
-window.addEventListener("DOMContentLoaded", pop.init); \ No newline at end of file +window.addEventListener("DOMContentLoaded", pop.init); diff --git a/grimer/layout.py b/grimer/layout.py index 4428f72..4d01971 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -112,6 +112,8 @@ def make_layout(ele, sizes, version, logo_path, title): main_panels.append(Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation")) main_tab = Tabs(tabs=main_panels) + #loading_div = Div(style={"visibility": "hidden", "display": "flex", "position": "fixed", "z-index": "100", "width": "100%", "height": "100%", "background-color": "rgba(192, 192, 192, 0.5)", "background-image": "url('https://i.stack.imgur.com/MnyxU.gif')", "background-repeat": "no-repeat", "background-position": "center"}) + logo_base64 = base64.b64encode(open(logo_path, 'rb').read()) # encode to base64 logo_base64 = logo_base64.decode() # convert to string logo_div = Div(text='' + 'v' + version + '', width=300, height=40, sizing_mode="fixed") @@ -121,4 +123,6 @@ def make_layout(ele, sizes, version, logo_path, title): title_div = Spacer() final = column([row(logo_div, title_div), main_tab], sizing_mode="stretch_width") + + return final diff --git a/grimer/plots.py b/grimer/plots.py index 71a8bbc..be64634 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -899,7 +899,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada # Show just first when loading metadata_fig.x_range.factors = ["1"] - for i, md_header in enumerate(metadata_fields): + for i, md_header in enumerate(legend_colorbars.keys()): # Start showing only first if i == 0: legend_colorbars[md_header].visible = True From e581d4d1af87a268da665fdc4e3c564034c1c3e5 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 2 Mar 2022 13:53:49 +0100 Subject: [PATCH 39/50] several fixes and improvs. 
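Context for the grimer/cds.py correlation changes below: scipy's spearmanr treats each column of the input matrix as one variable, so the observation labels must stay in the same sorted order as the matrix columns when the rho matrix is re-labeled and stacked. A rough standalone sketch of that pattern on toy counts with hypothetical taxa names (GRIMER additionally keeps only the lower triangle so each pair is stored once):

    import numpy as np
    import pandas as pd
    from scipy import stats

    counts = pd.DataFrame({"tax2": [1, 2, 3, 4], "tax1": [2, 1, 4, 3], "tax3": [4, 3, 2, 1]})
    labels = sorted(counts.columns)              # column order must match the labels used below
    rho, pval = stats.spearmanr(counts[labels])  # each column is one variable
    rho[np.triu_indices(rho.shape[0])] = np.nan  # drop upper triangle and self-correlations
    pairs = pd.DataFrame(rho, index=labels, columns=labels).stack()  # long format: (obs, obs) -> rho
    print(pairs)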
--- config/default.yaml | 2 +- grimer/callbacks.py | 38 ++++++++++++----- grimer/cds.py | 11 +++-- grimer/config.py | 2 +- grimer/decontam.py | 2 +- grimer/grimer.py | 2 +- grimer/layout.py | 11 ++--- grimer/plots.py | 102 +++++++++++++++++++++++++------------------- grimer/utils.py | 1 + 9 files changed, 102 insertions(+), 69 deletions(-) diff --git a/config/default.yaml b/config/default.yaml index 42d4bb3..4d28169 100644 --- a/config/default.yaml +++ b/config/default.yaml @@ -9,7 +9,7 @@ references: external: mgnify: "files/mgnify.tsv" decontam: - threshold: 0.1 # [0-1] + threshold: 0.1 # [0-1] P* hyperparameter method: "frequency" # frequency, prevalence, combined # # frequency (default: use sum of counts) # frequency_file: "path/file1.txt" diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 9fac2a4..c01a7be 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -216,8 +216,8 @@ def link_obstable_samplebars(ele, // activate only selected rank if(active_ranks[r]==selected_rank){ samplebars.renderers[r+3].visible=true; - }else{ - samplebars.renderers[r+3].visible=false; + //}else{ + // samplebars.renderers[r+3].visible=false; } } ''') @@ -302,7 +302,7 @@ def link_obstable_samplebars(ele, cds_p_decontam=cds_p_decontam, cds_p_decontam_models=cds_p_decontam_models, cds_d_decontam=cds_d_decontam, - pvalue_input=ele["decontam"]["wid"]["pvalue_input"]), + pscore_input=ele["decontam"]["wid"]["pscore_input"]), code=''' // selected row const row = cb_obj.indices[0]; @@ -323,11 +323,11 @@ def link_obstable_samplebars(ele, if (lines!=undefined){ cds_p_decontam_models.data["y_cont"] = [lines[0], lines[1]]; cds_p_decontam_models.data["y_noncont"] = [lines[2], lines[2]]; - pvalue_input.value = lines[3].toString(); + pscore_input.value = lines[3].toString(); }else{ cds_p_decontam_models.data["y_cont"] = []; cds_p_decontam_models.data["y_noncont"] = []; - pvalue_input.value = ""; + pscore_input.value = ""; } cds_p_decontam_models.change.emit(); ''') @@ -664,7 +664,7 @@ def link_heatmap_widgets(ele, heatmap.y_range.factors = sorted_factors; ''') - toggle_labels_callback = CustomJS( + toggle_label_callback = CustomJS( args=dict(cds_p_heatmap=cds_p_heatmap, xaxis=ele["heatmap"]["fig"].xaxis[0], yaxis=ele["heatmap"]["fig"].yaxis[0]), @@ -685,7 +685,7 @@ def link_heatmap_widgets(ele, } ''') - ele["heatmap"]["wid"]["toggle_labels"].js_on_click(toggle_labels_callback) + ele["heatmap"]["wid"]["toggle_label"].js_on_click(toggle_label_callback) ele["heatmap"]["wid"]["rank_select"].js_on_change('value', y_select_callback, x_select_callback, x_dendro_callback, y_dendro_callback) ele["heatmap"]["wid"]["x_sort_select"].js_on_change('value', x_select_callback, x_dendro_callback) ele["heatmap"]["wid"]["x_groupby_select"].js_on_change('value', x_select_callback, x_dendro_callback) @@ -801,9 +801,10 @@ def link_correlation_widgets(ele, cds_p_correlation): factors.add(cds_p_correlation.data["taxid"][i]); } } - factors = [...factors].sort(); - correlation.x_range.factors = factors; - correlation.y_range.factors = factors.reverse(); + var sorted_factors = [...factors].sort(); + correlation.y_range.factors = sorted_factors; + var rev_sorted_factors = [...sorted_factors].reverse(); + correlation.x_range.factors = rev_sorted_factors; ''') filter_callback = CustomJS( @@ -825,6 +826,23 @@ def link_correlation_widgets(ele, cds_p_correlation): cds_p_correlation.change.emit(); ''') + toggle_label_callback = CustomJS( + args=dict(xaxis=ele["correlation"]["fig"].xaxis[0], yaxis=ele["correlation"]["fig"].yaxis[0]), + 
code=''' + if(this.active.includes(0)){ + xaxis.major_label_text_font_size = "10px"; + xaxis.major_tick_line_color="black"; + yaxis.major_label_text_font_size = "10px"; + yaxis.major_tick_line_color="black"; + }else{ + xaxis.major_label_text_font_size = "0px"; + xaxis.major_tick_line_color=null; + yaxis.major_label_text_font_size = "0px"; + yaxis.major_tick_line_color=null; + } + ''') + + ele["correlation"]["wid"]["toggle_label"].js_on_click(toggle_label_callback) ele["correlation"]["wid"]["pos_slider"].js_on_change('value', filter_callback) ele["correlation"]["wid"]["neg_slider"].js_on_change('value', filter_callback) ele["correlation"]["wid"]["rank_select"].js_on_change('value', rank_select_callback) diff --git a/grimer/cds.py b/grimer/cds.py index 3ff52f5..7b5542b 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -54,9 +54,8 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp if decontam: contaminants = decontam.get_contaminants(rank, df_rank.index).values - print(contaminants) if contaminants.any(): - df_rank["decontam"] = decontam.get_pvalues(rank, df_rank.index)[contaminants] + df_rank["decontam"] = decontam.get_pscore(rank, df_rank.index)[contaminants] for desc, ref in references.items(): df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) @@ -289,7 +288,7 @@ def generate_cds_decontam(decontam, ranks): dict_coord_mod = {} for rank in ranks: df_valid_vals = decontam.rank[rank].dropna(subset=['contam']) - pval = decontam.get_pvalues(rank, df_valid_vals.index) + pval = decontam.get_pscore(rank, df_valid_vals.index) vals = list(zip(df_valid_vals["contam"], df_valid_vals["contam_2"], df_valid_vals["non.contam"], pval)) dict_coord_mod.update(dict(zip(df_valid_vals.index, vals))) @@ -460,12 +459,13 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): df_corr = pd.DataFrame(columns=["taxid", "rank", "rho"]) for rank in table.ranks(): if top_obs_corr: - top_taxids = table.get_top(rank, top_obs_corr) + top_taxids = sorted(table.get_top(rank, top_obs_corr)) matrix = table.get_subtable(taxids=top_taxids, rank=rank) else: top_taxids = sorted(table.observations(rank)) matrix = table.data[rank] + print(matrix) # No correlation with just one observation if len(matrix.columns) >= 2: @@ -480,6 +480,7 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): # to save half of the space rho[np.triu_indices(rho.shape[0])] = np.nan + print(rho) stacked_rank_df = pd.DataFrame(rho, index=top_taxids, columns=top_taxids).stack(dropna=False).reset_index(1) stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True) stacked_rank_df.rename(columns={0: "rho"}, inplace=True) @@ -488,6 +489,8 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): # Drop NA for rho (missing values and upper triangular matrix) stacked_rank_df.dropna(subset=['rho'], inplace=True) + print(stacked_rank_df) + df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) print_df(df_corr, "cds_p_correlation") diff --git a/grimer/config.py b/grimer/config.py index 48aae4c..4fecc71 100644 --- a/grimer/config.py +++ b/grimer/config.py @@ -52,7 +52,7 @@ def __new__(self, argv=None): heatmap_group.add_argument('--skip-dendrogram', default=False, action='store_true', help="Disable dendogram. 
Will create smaller files.") correlation_group = parser.add_argument_group('Correlation options') - correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=20, help="Top abundant observations to build the correlationn matrix, based on the avg. percentage counts/sample. 0 for all") + correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=50, help="Top abundant observations to build the correlationn matrix, based on the avg. percentage counts/sample. 0 for all") bars_group = parser.add_argument_group('Bars options') bars_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Top abundant observations to show in the bars.") diff --git a/grimer/decontam.py b/grimer/decontam.py index 867783a..8f91cd0 100644 --- a/grimer/decontam.py +++ b/grimer/decontam.py @@ -37,7 +37,7 @@ def get_data(self): def get_contaminants(self, rank, idx): return self.rank[rank].reindex(idx)["contaminant"] - def get_pvalues(self, rank, idx): + def get_pscore(self, rank, idx): return self.rank[rank].reindex(idx)["p"] def get_contaminant_list(self): diff --git a/grimer/grimer.py b/grimer/grimer.py index 3ab135d..ab8fe91 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -274,7 +274,7 @@ def main(argv=sys.argv[1:]): ele["decontam"]["fig"] = plot_decontam(sizes, cds_p_decontam, cds_p_decontam_models, min_obs_perc) else: ele["decontam"]["fig"] = None - ele["decontam"]["wid"] = plot_decontam_widgets() + ele["decontam"]["wid"] = plot_decontam_widgets(sizes) # samplebars ele["samplebars"] = {} diff --git a/grimer/layout.py b/grimer/layout.py index 4d01971..26cd445 100644 --- a/grimer/layout.py +++ b/grimer/layout.py @@ -32,8 +32,8 @@ def make_layout(ele, sizes, version, logo_path, title): if ele["decontam"]["fig"]: info_tabs.append(Panel(child=column(ele["decontam"]["fig"], - row(ele["decontam"]["wid"]["pvalue_text"], - ele["decontam"]["wid"]["pvalue_input"], + row(ele["decontam"]["wid"]["pscore_text"], + ele["decontam"]["wid"]["pscore_input"], ele["decontam"]["wid"]["help_button"]) ), title="DECONTAM")) @@ -87,7 +87,7 @@ def make_layout(ele, sizes, version, logo_path, title): merge_tools=True) row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"], - ele["heatmap"]["wid"]["toggle_labels"], + ele["heatmap"]["wid"]["toggle_label"], width=300), row(column(ele["heatmap"]["wid"]["x_groupby_select"], ele["heatmap"]["wid"]["x_sort_select"]), @@ -102,6 +102,7 @@ def make_layout(ele, sizes, version, logo_path, title): row_correlation = row(column(ele["correlation"]["wid"]["rank_select"], ele["correlation"]["wid"]["neg_slider"], ele["correlation"]["wid"]["pos_slider"], + ele["correlation"]["wid"]["toggle_label"], ele["correlation"]["wid"]["help_button"]), ele["correlation"]["fig"]) @@ -112,8 +113,6 @@ def make_layout(ele, sizes, version, logo_path, title): main_panels.append(Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation")) main_tab = Tabs(tabs=main_panels) - #loading_div = Div(style={"visibility": "hidden", "display": "flex", "position": "fixed", "z-index": "100", "width": "100%", "height": "100%", "background-color": "rgba(192, 192, 192, 0.5)", "background-image": "url('https://i.stack.imgur.com/MnyxU.gif')", "background-repeat": "no-repeat", "background-position": "center"}) - logo_base64 = base64.b64encode(open(logo_path, 'rb').read()) # encode to base64 logo_base64 = logo_base64.decode() # convert to string logo_div = Div(text='' + 'v' + version + '', width=300, height=40, sizing_mode="fixed") @@ -123,6 +122,4 @@ def 
make_layout(ele, sizes, version, logo_path, title): title_div = Spacer() final = column([row(logo_div, title_div), main_tab], sizing_mode="stretch_width") - - return final diff --git a/grimer/plots.py b/grimer/plots.py index be64634..e36c25c 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -111,7 +111,7 @@ def plot_obsbars(cds_p_obsbars, dict_d_topobs, ranks, top_obs_bars, dict_d_taxna // value holds the column index var taxid = dict_d_topobs.data.dict_d_topobs[0][rank_select.value][value]; if(taxid!=undefined){ - return dict_d_taxname.data.dict_d_taxname[0][taxid]; + return dict_d_taxname.data.dict_d_taxname[0][taxid]; }else{ return value; } @@ -214,7 +214,7 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs groupby1_select = Select(title="1) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") groupby2_select = Select(title="2) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") - toggle_label = CheckboxGroup(labels=["Show/Hide samples labels"], active=[]) + toggle_label = CheckboxGroup(labels=["Show samples labels"], active=[]) help_text = """ Observation bars showing proportions of top """ + str(top_obs_bars) + """ most abundant observations. @@ -266,7 +266,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec groupby1_select = Select(title="1) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") groupby2_select = Select(title="2) Group samples by", value="none", options=groupby_options, sizing_mode="stretch_width") - toggle_label = CheckboxGroup(labels=["Show/Hide samples labels"], active=[]) + toggle_label = CheckboxGroup(labels=["Show samples labels"], active=[]) help_text = """ Bars showing total counts (left y-axis) for each sample (x-axis). @@ -449,60 +449,62 @@ def plot_decontam(sizes, cds_p_decontam, cds_p_decontam_lines, min_obs_perc): tools="save") palette = make_color_palette(2) - factors = list(set(cds_p_decontam.data["controls"])) - + factors = list(sorted(set(cds_p_decontam.data["controls"]), reverse=True)) # Add legend on top decontam_fig.add_layout(Legend(), 'above') - decontam_fig.circle(x="concentration", y="counts", - source=cds_p_decontam, - color=factor_cmap('controls', palette=palette, factors=factors), - legend_group="controls", - size=3) - - # If there are controls, show legend - if len(factors) > 1: - decontam_fig.legend.margin = 0 - decontam_fig.legend.border_line_width = 0 - decontam_fig.legend.spacing = 0 - decontam_fig.legend.padding = 0 - decontam_fig.legend.orientation = "horizontal" - decontam_fig.legend.location = "bottom_right" - else: - decontam_fig.legend.visible = False + points = decontam_fig.circle(x="concentration", y="counts", + source=cds_p_decontam, + color=factor_cmap('controls', palette=palette, factors=factors), + legend_group="controls", + size=3) + + # Add tooltip just for points + decontam_fig.add_tools(HoverTool(renderers=[points], tooltips=[('Sample', '@index')])) decontam_fig.line(x="x", y="y_cont", source=cds_p_decontam_lines, - color="red") + color="red", + legend_label="Cont.") decontam_fig.line(x="x", y="y_noncont", source=cds_p_decontam_lines, color="black", - line_dash="dashed") - - decontam_fig.xaxis.axis_label = 'DNA Concentration/Total counts' - decontam_fig.yaxis.axis_label = 'obs. 
counts' + line_dash="dashed", + legend_label="Non-cont.") + + decontam_fig.legend.margin = 0 + decontam_fig.legend.border_line_width = 0 + decontam_fig.legend.spacing = 0 + decontam_fig.legend.padding = 0 + decontam_fig.legend.orientation = "horizontal" + decontam_fig.legend.location = "bottom_right" + + decontam_fig.xaxis.axis_label = 'Concentration' + decontam_fig.yaxis.axis_label = 'Counts' decontam_fig.y_range.start = min_obs_perc decontam_fig.y_range.end = 1 return decontam_fig -def plot_decontam_widgets(): - pvalue_text = Paragraph(text="P-value") - pvalue_input = TextInput(value="", width=180, align='end') +def plot_decontam_widgets(sizes): + pscore_text = Paragraph(text="P-score") + pscore_input = TextInput(value="", width=sizes["overview_top_panel_width_right"] - 150, align='end', disabled=True) help_text = """ -Plot to verify the DECONTAM [1] output. Proportion of counts (y-axis) against DNA Concentration (if provided) or Total number of counts (x-axis), both in log10 scale. If provided, controls samples are displayed in a different color. +Plot to verify the DECONTAM [1] output. Proportion of counts of selected observation (y-axis) against DNA Concentration (if provided) or Total number of counts (x-axis) of each sample, both in log10 scale. If provided, controls samples are displayed in a different color. A indication of contamination is when counts are inversely proportional to DNA concentration. The red and black dotted lines are the expected models for contamination and non-contamination, respectively. A good indication for contamination is when the dots (excluding control samples) "fit" the red line model. +The P-score statistic is not a P-value and it is not associated with any guarantees on the type 1 error rate [1]. Small scores indicate the contaminant model is a better fit, and high scores indicate that the non-contaminant model is a better fit. + More details can be found in the [DECONTAM Introduction guide](https://benjjneb.github.io/decontam/vignettes/decontam_intro.html) [1] Davis, N. M. et al. Simple statistical identification and removal of contaminant sequences in marker-gene and metagenomics data. Microbiome (2018) 10.1186/s40168-018-0605-2. """ - return {"pvalue_text": pvalue_text, - "pvalue_input": pvalue_input, + return {"pscore_text": pscore_text, + "pscore_input": pscore_input, "help_button": help_button(title="DECONTAM", text=help_text, align="start")} @@ -755,7 +757,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name y_groupby_select = Select(title="Sample cluster/group by:", value="none", options=y_groupby_options) y_sort_select = Select(title="Sample sort:", value="none", options=y_sort_options) - toggle_labels = CheckboxGroup(labels=["Show/Hide observations labels", "Show/Hide samples labels"], active=[]) + toggle_label = CheckboxGroup(labels=["Show observations labels", "Show samples label"], active=[]) help_text = """ The heatmap shows [transformed] values from the input table (color bar on top). If taxonomy is provided, one heatmap for each taxonomic rank is generated. 
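The label toggles added to the heatmap widgets here (and, further below, to the correlation panel) all follow the pattern of the toggle_label_callback in callbacks.py above. Distilled into a standalone Bokeh sketch with placeholder figure and names; the real wiring lives in callbacks.py:

    # Standalone version of the label-toggle pattern: hide axis labels by
    # shrinking the font to 0px and clearing tick color, so the categorical
    # factor ranges stay intact for sorting/clustering callbacks.
    from bokeh.models import CheckboxGroup, CustomJS
    from bokeh.plotting import figure

    fig = figure(x_range=["obs1", "obs2"], y_range=["s1", "s2"])
    toggle_label = CheckboxGroup(labels=["Show labels"], active=[])
    toggle_label.js_on_click(CustomJS(
        args=dict(xaxis=fig.xaxis[0], yaxis=fig.yaxis[0]),
        code='''
        if (this.active.includes(0)) {
            xaxis.major_label_text_font_size = "10px";
            xaxis.major_tick_line_color = "black";
            yaxis.major_label_text_font_size = "10px";
            yaxis.major_tick_line_color = "black";
        } else {
            xaxis.major_label_text_font_size = "0px";
            xaxis.major_tick_line_color = null;
            yaxis.major_label_text_font_size = "0px";
            yaxis.major_tick_line_color = null;
        }
        '''))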
@@ -776,7 +778,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name "x_sort_select": x_sort_select, "y_groupby_select": y_groupby_select, "y_sort_select": y_sort_select, - "toggle_labels": toggle_labels, + "toggle_label": toggle_label, "help_button": help_button(title="Heatmap/Clustering", text=help_text)} @@ -927,7 +929,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada return metadata_multiselect.value[tick-1]; ''') - toggle_legend = CheckboxGroup(labels=["Show/Hide metadata legend"], active=[0]) + toggle_legend = CheckboxGroup(labels=["Show metadata legend"], active=[0]) return metadata_fig, {"metadata_multiselect": metadata_multiselect, "legend_colorbars": legend_colorbars, "toggle_legend": toggle_legend} @@ -990,18 +992,23 @@ def plot_annotations(heatmap, tools_heatmap, cds_p_annotations, dict_d_taxname): def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): taxids = set() - for i, rank in enumerate(cds_p_correlation.data["rank"]): - if rank == ranks[0]: - taxids.add(cds_p_correlation.data["index"][i]) - taxids.add(cds_p_correlation.data["taxid"][i]) - - taxids = sorted(taxids) - corr_fig = figure(x_range=taxids, - y_range=list(reversed(taxids)), + taxids.update(cds_p_correlation.data["index"]) + taxids.update(cds_p_correlation.data["taxid"]) + corr_fig = figure(x_range=sorted(taxids, reverse=True), + y_range=sorted(taxids), tools="hover,save,reset,crosshair,tap,box_zoom", tooltips="", sizing_mode="scale_height") + # Start showing only first rank + factors = set() + for i, rank in enumerate(cds_p_correlation.data["rank"]): + if rank == ranks[0]: + factors.add(cds_p_correlation.data["index"][i]) + factors.add(cds_p_correlation.data["taxid"][i]) + corr_fig.x_range.factors = sorted(factors, reverse=True) + corr_fig.y_range.factors = sorted(factors) + # Need to pass dict_d_taxname inside a one column data taxid_name_custom = CustomJSHover( args=dict(dict_d_taxname=ColumnDataSource(dict(dict_d_taxname=[dict_d_taxname]))), @@ -1022,7 +1029,7 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): rho_filter = IndexFilter() cds_view_correlation = CDSView(source=cds_p_correlation, filters=[rho_filter]) - corr_fig.rect(x="taxid", y="index", + corr_fig.rect(x="index", y="taxid", width=1, height=1, source=cds_p_correlation, view=cds_view_correlation, @@ -1050,6 +1057,11 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): corr_fig.yaxis.minor_tick_line_color = None corr_fig.xaxis.major_label_orientation = "vertical" + corr_fig.xaxis.major_tick_line_color = None + corr_fig.xaxis.major_label_text_font_size = "0px" + corr_fig.yaxis.major_tick_line_color = None + corr_fig.yaxis.major_label_text_font_size = "0px" + return corr_fig, rho_filter @@ -1057,6 +1069,7 @@ def plot_correlation_widgets(ranks, top_obs_corr): rank_select = Select(title="Taxonomic rank:", value=ranks[0], options=ranks) neg_slider = RangeSlider(start=-1, end=0, value=(-1, 0), step=.01, title="Negative correlation") pos_slider = RangeSlider(start=0, end=1, value=(0, 1), step=.01, title="Positive correlation") + toggle_label = CheckboxGroup(labels=["Show observations labels"], active=[]) help_text = """ Symmetric proportionality coefficient (rho correlation) [1,2] between the top """ + str(top_obs_corr) + """ most abundant observations, based on log-ratios (clr). Only half matrix is displayed, since the values are symmetric. 
@@ -1071,6 +1084,7 @@ def plot_correlation_widgets(ranks, top_obs_corr): return {"rank_select": rank_select, "neg_slider": neg_slider, "pos_slider": pos_slider, + "toggle_label": toggle_label, "help_button": help_button(title="Correlation", text=help_text)} diff --git a/grimer/utils.py b/grimer/utils.py index 844662a..1044356 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -385,6 +385,7 @@ def run_decontam(cfg, table, metadata, control_samples): return None decontam = Decontam(df_decontam) + # Run DECONTAM for each for each for rank in table.ranks(): From 6b6b2885b65c9ea2a7294991e3a50a67db69e62c Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Wed, 2 Mar 2022 17:14:06 +0100 Subject: [PATCH 40/50] working with norm data --- grimer/cds.py | 29 ++++++++++++----------------- grimer/config.py | 1 + grimer/grimer.py | 35 ++++++++++++++++++++++++----------- grimer/plots.py | 19 +++++++++++-------- grimer/table.py | 30 +++++++++++++++++++++++------- grimer/utils.py | 27 ++++++++++++++++----------- 6 files changed, 87 insertions(+), 54 deletions(-) diff --git a/grimer/cds.py b/grimer/cds.py index 7b5542b..72bc90b 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -100,7 +100,7 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp return ColumnDataSource(df_annotations) -def generate_cds_obstable(table, tax, references, controls, control_samples, decontam): +def generate_cds_obstable(table, tax, references, controls, control_samples, decontam, normalized): # index unique taxids # col|... values to plot to columns in the datatable # tax|... auxiliary lineage of taxa entries @@ -119,9 +119,9 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec # Frequency of taxa among all samples df_rank["col|frequency_perc"] = table.get_frequency_perc(rank) + df_rank["col|counts_perc_avg"] = table.get_counts_perc_avg_samples(rank) # Average percentage of counts among all samples - df_rank["col|counts_perc_avg"] = table.get_counts_perc_avg(rank) - df_rank["col|total_counts"] = table.get_total_counts(rank) + df_rank["col|total_counts"] = table.get_counts(rank) # If active - add decontam True/False results if decontam: @@ -157,17 +157,16 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec return ColumnDataSource(df_obstable) -def generate_cds_sampletable(table): +def generate_cds_sampletable(table, normalized): # index unique sample-ids # col|... 
values to plot to columns in the datatable df_sampletable = pd.DataFrame(index=table.samples) - df_sampletable["col|total"] = table.total - assigned = table.total - table.unassigned - df_sampletable["col|assigned"] = assigned - df_sampletable["col|assigned_perc"] = assigned.divide(table.total, axis=0) - df_sampletable["col|unassigned"] = table.unassigned - df_sampletable["col|unassigned_perc"] = table.unassigned.divide(table.total, axis=0) + df_sampletable["col|total"] = table.get_total() if not normalized else 0 + df_sampletable["col|assigned"] = table.get_assigned() if not normalized else 0 + df_sampletable["col|assigned_perc"] = table.get_assigned_perc() + df_sampletable["col|unassigned"] = table.get_unassigned() if not normalized else 0 + df_sampletable["col|unassigned_perc"] = table.get_unassigned_perc() # assigned by rank for rank in table.ranks(): @@ -188,11 +187,11 @@ def generate_cds_samplebars(table): df_bars = pd.DataFrame(index=table.samples) # factors: set the x-axis reference for plotting, it can be dinamically changed (with groups) df_bars["aux|factors"] = df_bars.index - df_bars["bar|unassigned"] = table.unassigned + df_bars["bar|unassigned"] = table.get_unassigned() # Initialized with "Assigned" of first rank - df_bars["bar|selected"] = table.data[table.ranks()[0]].sum(axis=1) + df_bars["bar|selected"] = table.get_subtable(table.ranks()[0]).sum(axis=1) # Total assigned - assigned to rank - df_bars["bar|others"] = (table.total - table.unassigned) - df_bars["bar|selected"] + df_bars["bar|others"] = (table.get_total() - table.get_unassigned()) - df_bars["bar|selected"] # Add empty cols for taxa values, to be dynamically inserted (None to avoid printing 0) for rank in table.ranks(): df_bars["tax|" + rank] = None @@ -465,7 +464,6 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): top_taxids = sorted(table.observations(rank)) matrix = table.data[rank] - print(matrix) # No correlation with just one observation if len(matrix.columns) >= 2: @@ -480,7 +478,6 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): # to save half of the space rho[np.triu_indices(rho.shape[0])] = np.nan - print(rho) stacked_rank_df = pd.DataFrame(rho, index=top_taxids, columns=top_taxids).stack(dropna=False).reset_index(1) stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True) stacked_rank_df.rename(columns={0: "rho"}, inplace=True) @@ -489,8 +486,6 @@ def generate_cds_correlation(table, top_obs_corr, replace_zero_value): # Drop NA for rho (missing values and upper triangular matrix) stacked_rank_df.dropna(subset=['rho'], inplace=True) - print(stacked_rank_df) - df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) print_df(df_corr, "cds_p_correlation") diff --git a/grimer/config.py b/grimer/config.py index 4fecc71..b6ffc2d 100644 --- a/grimer/config.py +++ b/grimer/config.py @@ -27,6 +27,7 @@ def __new__(self, argv=None): table_group = parser.add_argument_group('Table options') table_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hiearchical multi-level table where the observations headers are separated by the indicated separator characther (usually ';' or '|')") + table_group.add_argument('-y', '--values', default=None, type=str, help="Force 'count' or 'normalized' data parsing. 
Empty to auto-detect.") table_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table (if samples are listed on columns and observations on rows)") table_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unsassinged/unclassified counts.") table_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on table observations labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") diff --git a/grimer/grimer.py b/grimer/grimer.py index ab8fe91..f749e9b 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -68,6 +68,19 @@ def main(argv=sys.argv[1:]): args.transpose = True table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose, args.sample_replace) + + if args.values == "count": + normalized = False + elif args.values == "normalized": + normalized = True + elif (table_df.sum(axis=1).round() == 100).all() or (table_df % 1 != 0).any().any(): + normalized = True + else: + normalized = False + + if normalized: + print_log("- Normalized values") + if args.level_separator: ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) else: @@ -77,8 +90,7 @@ def main(argv=sys.argv[1:]): print_log("Could not parse input table") return 1 - table = Table(table_df.index, total, unassigned) - table.lineage = lineage + table = Table(table_df.index, total, unassigned, lineage, normalized) print_log("") print_log("Total valid samples: " + str(len(table.samples))) @@ -91,7 +103,7 @@ def main(argv=sys.argv[1:]): for r, t in ranked_tables.items(): print_log("--- " + r + " ---") - filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count)) + filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count, normalized)) if t.empty: print_log("No valid entries, skipping") else: @@ -100,9 +112,10 @@ def main(argv=sys.argv[1:]): print_log("Total valid observations: " + str(len(table.observations(r)))) print_log("") - print_log("Total assigned (sum): " + str(table.total.sum() - table.unassigned.sum())) - print_log("Total unassigned (sum): " + str(table.unassigned.sum())) - print_log("") + if not normalized: + print_log("Total assigned (counts): " + str(table.total.sum() - table.unassigned.sum())) + print_log("Total unassigned (counts): " + str(table.unassigned.sum())) + print_log("") # Zero replacement try: @@ -147,7 +160,7 @@ def main(argv=sys.argv[1:]): # Run and load decontam results if args.decontam: print_log("- Running DECONTAM") - decontam = run_decontam(cfg, table, metadata, control_samples) + decontam = run_decontam(cfg, table, metadata, control_samples, normalized) print_log("") else: decontam = None @@ -182,7 +195,7 @@ def main(argv=sys.argv[1:]): # _p_ # df: index (unique observations), col|..., tax|..., aux|ref # this cds an exeption and contains data to plot (col|) and auxiliary data (tax|) - cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam) + cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam, normalized) # df: index (unique sample-ids), aux|..., bar|..., tax|... 
cds_p_samplebars = generate_cds_samplebars(table) # stacked: index (repeated observations), rank, ref, direct, parent @@ -206,7 +219,7 @@ def main(argv=sys.argv[1:]): # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars) # df: index (unique sample-ids), col|... - cds_p_sampletable = generate_cds_sampletable(table) + cds_p_sampletable = generate_cds_sampletable(table, normalized) # _d_ # dict: {rank: {obs: {sample: count}}} @@ -278,8 +291,8 @@ def main(argv=sys.argv[1:]): # samplebars ele["samplebars"] = {} - ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, max_total_count, table.ranks()) - ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, list(references.keys()), list(controls.keys()), decontam) + ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, max_total_count, table.ranks(), normalized) + ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, list(references.keys()), list(controls.keys()), decontam, normalized) # sampletable ele["sampletable"] = {} diff --git a/grimer/plots.py b/grimer/plots.py index e36c25c..2d21cb4 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -11,7 +11,7 @@ from grimer.utils import format_js_toString, make_color_palette -def plot_samplebars(cds_p_samplebars, max_total_count, ranks): +def plot_samplebars(cds_p_samplebars, max_total_count, ranks, normalized): # Bar plots has 3 main stacks: selection, others, unassigned # stacks can be annotated with references and controls samplebars_fig = figure(x_range=FactorRange(factors=cds_p_samplebars.data["aux|factors"]), @@ -89,7 +89,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks): samplebars_fig.xaxis.subgroup_label_orientation = "vertical" samplebars_fig.xaxis.axis_label = "samples" - samplebars_fig.yaxis[0].axis_label = "# counts" + samplebars_fig.yaxis[0].axis_label = "# counts" if not normalized else "% counts" samplebars_fig.yaxis[1].axis_label = "% observations" samplebars_fig.yaxis[1].axis_label_text_color = "#606c38" @@ -232,7 +232,7 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs "help_button": help_button(title="Observation bars", text=help_text)} -def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, decontam): +def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, decontam, normalized): annotbar_rank_select = Select(title="Annotate bars at rank:", value=ranks[0], options=[r for r in ranks]) annotbar_options = {} @@ -243,8 +243,12 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec annotbar_options["Decontam"] = ["decontam"] annotbar_select = Select(title="Annotate bars by:", value="assigned", options=annotbar_options) - y1_select = Select(title="Counts", value="#", options=["#", "%"], width=80) - y2_select = Select(title="Observations", value="%", options=["#", "%", "log10(#)", "log10(%)"], width=80) + if normalized: + y1_select = Select(title="Counts", value="%", options=["%"], width=80) + y2_select = Select(title="Observations", value="%", options=["%", "log10(%)"], width=80) + else: + y1_select = Select(title="Counts", value="#", options=["#", "%"], width=80) + y2_select = Select(title="Observations", value="%", options=["#", "%", "log10(#)", "log10(%)"], width=80) 
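The count/normalized branching above (and throughout this patch) hangs off a single flag that grimer.py auto-detects when --values is not given. The detection expressions below are the ones added in grimer.py; the wrapper function and sample data are just for illustration:

    # Auto-detection heuristic isolated from grimer.py: treat the table as
    # normalized if every sample row sums to ~100 (percentages) or if any
    # value is fractional (raw counts are whole numbers).
    import pandas as pd

    def detect_normalized(table_df: pd.DataFrame) -> bool:
        sums_to_100 = (table_df.sum(axis=1).round() == 100).all()
        has_fractions = (table_df % 1 != 0).any().any()
        return sums_to_100 or has_fractions

    df = pd.DataFrame({"taxA": [60.0, 25.5], "taxB": [40.0, 74.5]})
    print(detect_normalized(df))  # True (rows sum to 100)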
sort_options = {} sort_options["Default"] = [("input_order", "input order"), ("counts", "counts"), ("selected_annotation", "selected annotation")] @@ -448,7 +452,7 @@ def plot_decontam(sizes, cds_p_decontam, cds_p_decontam_lines, min_obs_perc): sizing_mode="stretch_width", tools="save") - palette = make_color_palette(2) + palette = make_color_palette(2) #Control, Sample factors = list(sorted(set(cds_p_decontam.data["controls"]), reverse=True)) # Add legend on top decontam_fig.add_layout(Legend(), 'above') @@ -996,8 +1000,7 @@ def plot_correlation(cds_p_correlation, ranks, dict_d_taxname): taxids.update(cds_p_correlation.data["taxid"]) corr_fig = figure(x_range=sorted(taxids, reverse=True), y_range=sorted(taxids), - tools="hover,save,reset,crosshair,tap,box_zoom", - tooltips="", + tools="save,reset,crosshair,tap,box_zoom", sizing_mode="scale_height") # Start showing only first rank diff --git a/grimer/table.py b/grimer/table.py index 5740455..e9ab0b5 100644 --- a/grimer/table.py +++ b/grimer/table.py @@ -2,13 +2,14 @@ class Table: - def __init__(self, samples, total, unassigned): + def __init__(self, samples, total, unassigned, lineage, normalized): # Ordered dict to keep rank insert order self.data = OrderedDict() - self.lineage = None + self.lineage = lineage self.samples = samples self.total = total self.unassigned = unassigned + self.normalized = normalized def __repr__(self): args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] @@ -23,6 +24,21 @@ def observations(self, rank): def ranks(self): return list(self.data.keys()) + def get_total(self): + return self.total + + def get_unassigned(self): + return self.unassigned + + def get_assigned(self): + return self.get_total() - self.get_unassigned() + + def get_unassigned_perc(self): + return self.get_unassigned().divide(self.get_total(), axis=0) if not self.normalized else self.get_unassigned().divide(100, axis=0) + + def get_assigned_perc(self): + return self.get_assigned().divide(self.get_total(), axis=0) if not self.normalized else self.get_assigned().divide(100, axis=0) + def get_lineage(self, taxid, rank, other_rank): # get lineage up-to requested rank return self.lineage[self.lineage[rank] == taxid][other_rank].values[0] @@ -33,17 +49,17 @@ def get_frequency(self, rank): def get_frequency_perc(self, rank): return self.get_frequency(rank) / len(self.samples) - def get_total_counts(self, rank): - return self.data[rank].sum(axis=0) + def get_counts(self, rank): + return self.data[rank].sum(axis=0) if not self.normalized else 0 def get_counts_perc(self, rank): - return self.data[rank].divide(self.total, axis=0) + return self.data[rank].divide(self.get_total(), axis=0) if not self.normalized else self.data[rank].divide(100, axis=0) - def get_counts_perc_avg(self, rank): + def get_counts_perc_avg_samples(self, rank): return self.get_counts_perc(rank).sum(axis=0) / len(self.samples) def get_top(self, rank, n): - return sorted(self.get_counts_perc_avg(rank).sort_values(ascending=False).index[:n].to_list()) + return sorted(self.get_counts_perc_avg_samples(rank).sort_values(ascending=False).index[:n].to_list()) def get_subtable(self, rank, samples: list=[], taxids: list=[], keep_shape: bool=False): subtable = self.data[rank] diff --git a/grimer/utils.py b/grimer/utils.py index 1044356..edf28a5 100644 --- a/grimer/utils.py +++ b/grimer/utils.py @@ -76,19 +76,19 @@ def parse_input_table(input_file, unassigned_header, transpose, sample_replace): print_log("- Trimming table") table_df = trim_table(table_df) - # Filter based on the 
table + # Filter based on the final table unassigned = unassigned.reindex(table_df.index) total = total.reindex(table_df.index) return table_df, total, unassigned -def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count): +def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count, normalized): if min_count: cnt = table_df.sum().sum() if min_count < 1: - table_df_norm = transform_table(table_df, total, "norm", 0) + table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df table_df = table_df[table_df_norm >= min_count].fillna(0) elif min_count > 1: table_df = table_df[table_df >= min_count].fillna(0) @@ -97,7 +97,7 @@ def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, if max_count: cnt = table_df.sum().sum() if max_count < 1: - table_df_norm = transform_table(table_df, total, "norm", 0) + table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df table_df = table_df[table_df_norm <= max_count].fillna(0) elif max_count > 1: table_df = table_df[table_df <= max_count].fillna(0) @@ -314,7 +314,7 @@ def update_tax_nodes(nodes, tax): return updated_nodes -def run_decontam(cfg, table, metadata, control_samples): +def run_decontam(cfg, table, metadata, control_samples, normalized): df_decontam = pd.DataFrame(index=table.samples, columns=["concentration", "controls"]) cfg_decontam = cfg["external"]["decontam"] tmp_output_prefix = "tmp_" @@ -344,11 +344,13 @@ def run_decontam(cfg, table, metadata, control_samples): else: print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata. Skipping DECONTAM.") return None - else: + elif not normalized: # Use total from table - print_log("WARNING: Using total counts as frequency for DECONTAM") + print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)") df_decontam["concentration"] = table.total - + else: + print_log("Cannot run DECONTAM without concentration and normalized values") + return None # Print concentrations to file df_decontam["concentration"].to_csv(out_concentration, sep="\t", header=False, index=True) @@ -385,15 +387,18 @@ def run_decontam(cfg, table, metadata, control_samples): return None decontam = Decontam(df_decontam) - + # Run DECONTAM for each for each for rank in table.ranks(): if len(table.observations(rank)) == 1: decontam.add_rank_empty(rank, table.observations(rank)) else: - # normalized and write temporary table for each rank - transform_table(table.data[rank], table.total[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) + # normalize and write temporary table for each rank + if not normalized: + transform_table(table.data[rank], table.total[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) + else: + table.data[rank].to_csv(out_table, sep="\t", header=True, index=True) cmd = " ".join(["scripts/run_decontam.R", "--resout " + tmp_output_prefix + "decontam_out.tsv", From 83302895ed1ae2ea2e61f5a23035422635c62885 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Thu, 3 Mar 2022 14:34:26 +0100 Subject: [PATCH 41/50] metadata from biom, bugs --- grimer/callbacks.py | 16 ++++++++-- grimer/cds.py | 1 - grimer/config.py | 2 +- grimer/grimer.py | 72 +++++++++++++++++++++++++++------------------ grimer/js/func.js | 6 ++-- grimer/metadata.py | 10 +++++-- grimer/plots.py | 9 +++--- grimer/utils.py | 7 ++++- 8 files changed, 80 insertions(+), 43 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index c01a7be..0f5c304 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -657,8 +657,10 @@ def link_heatmap_widgets(ele, } } - for (let i = 0; i < cds_p_metadata.data["index"].length; i++) { - cds_p_metadata.data["factors"][i] = dict_factors[cds_p_metadata.data["index"][i]]; + if (cds_p_metadata){ + for (let i = 0; i < cds_p_metadata.data["index"].length; i++) { + cds_p_metadata.data["factors"][i] = dict_factors[cds_p_metadata.data["index"][i]]; + } } heatmap.y_range.factors = sorted_factors; @@ -696,6 +698,7 @@ def link_heatmap_widgets(ele, def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols): metadata_multiselect_callback = CustomJS( args=dict(metadata_heatmap=ele["metadata"]["fig"], + metadata_heatmap_xaxis=ele["metadata"]["fig"].xaxis[0], metadata_multiselect=ele["metadata"]["wid"]["metadata_multiselect"], legend_colorbars=ele["metadata"]["wid"]["legend_colorbars"], toggle_legend=ele["metadata"]["wid"]["toggle_legend"], @@ -704,11 +707,20 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols cds_d_metadata=cds_d_metadata), code=''' const index_len = cds_d_metadata.data["index"].length; + var x_factors = []; var empty_y_values = new Array(index_len); for (var i = 0; i < index_len; ++i) empty_y_values[i]=["", ""]; // hide all legends for (let md_header in legend_colorbars) legend_colorbars[md_header].visible = false; + + // set legend orientation + if(metadata_multiselect.value.length==1) + metadata_heatmap_xaxis.major_label_orientation = "horizontal"; + else + metadata_heatmap_xaxis.major_label_orientation = 0.7; + + console.log(metadata_heatmap_xaxis) for(var s=0; s < max_metadata_cols; ++s){ if (s Date: Thu, 3 Mar 2022 14:43:59 +0100 Subject: [PATCH 42/50] metadata biom bug --- grimer/callbacks.py | 12 +++++++----- grimer/grimer.py | 10 +++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 0f5c304..b1647f9 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -698,7 +698,7 @@ def link_heatmap_widgets(ele, def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols): metadata_multiselect_callback = CustomJS( args=dict(metadata_heatmap=ele["metadata"]["fig"], - metadata_heatmap_xaxis=ele["metadata"]["fig"].xaxis[0], + metadata_heatmap_xaxis=ele["metadata"]["fig"].xaxis[0] if cds_p_metadata else None, metadata_multiselect=ele["metadata"]["wid"]["metadata_multiselect"], legend_colorbars=ele["metadata"]["wid"]["legend_colorbars"], toggle_legend=ele["metadata"]["wid"]["toggle_legend"], @@ -715,10 +715,12 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols for (let md_header in legend_colorbars) legend_colorbars[md_header].visible = false; // set legend orientation - if(metadata_multiselect.value.length==1) - metadata_heatmap_xaxis.major_label_orientation = "horizontal"; - else - metadata_heatmap_xaxis.major_label_orientation = 0.7; + if(metadata_heatmap_xaxis){ + if(metadata_multiselect.value.length==1) + 
metadata_heatmap_xaxis.major_label_orientation = "horizontal"; + else + metadata_heatmap_xaxis.major_label_orientation = 0.7; + } console.log(metadata_heatmap_xaxis) for(var s=0; s < max_metadata_cols; ++s){ diff --git a/grimer/grimer.py b/grimer/grimer.py index 6e2611f..1484f2c 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -139,9 +139,13 @@ def main(argv=sys.argv[1:]): if args.metadata: metadata = Metadata(metadata_file=args.metadata, samples=table.samples.to_list()) elif args.input_file.endswith(".biom"): - biom_in = biom.load_table(args.input_file) - if biom_in.metadata() is not None: - metadata = Metadata(metadata_table=biom_in.metadata_to_dataframe(axis="sample"), samples=table.samples.to_list()) + try: + biom_in = biom.load_table(args.input_file) + if biom_in.metadata() is not None: + metadata = Metadata(metadata_table=biom_in.metadata_to_dataframe(axis="sample"), samples=table.samples.to_list()) + except: + metadata = None + print_log("Error parsing metadata from BIOM file") if metadata is None or metadata.data.empty: metadata = None From 01b0cde1e2a87492b6e00b7928c1c35811589e33 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Thu, 3 Mar 2022 15:02:26 +0100 Subject: [PATCH 43/50] bug group heatmap samples --- grimer/callbacks.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index b1647f9..820b549 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -636,18 +636,18 @@ def link_heatmap_widgets(ele, }else if (y_groupby_select.value.startsWith("group_metadata|")){ const group_metadata = y_groupby_select.value.replace("group_metadata|",""); - // group entries without metadata with space " " - var groupby_col = cds_d_metadata.data[group_metadata]; + // group entries and replace empty with space " " + var groupby_col = cds_d_metadata.data[group_metadata].map(function(m) { return m == "" ? 
" " : m; }); + var factors = []; for(let i = 0; i < annot_samples.length; i++){ dict_factors[annot_samples[i]] = [groupby_col[i], annot_samples[i]]; + factors.push([groupby_col[i], annot_samples[i]]); } - sorted_factors = grimer_sort(Object.values(dict_factors), sort_col, sort_col_type, false, groupby_col); - + sorted_factors = grimer_sort(factors, sort_col, sort_col_type, false, groupby_col); } } - // update factors on heatmap col otherwise remove for (let i = 0; i < cds_p_heatmap.data["index"].length; i++) { if(cds_p_heatmap.data["rank"][i]==rank){ @@ -722,7 +722,6 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols metadata_heatmap_xaxis.major_label_orientation = 0.7; } - console.log(metadata_heatmap_xaxis) for(var s=0; s < max_metadata_cols; ++s){ if (s Date: Fri, 4 Mar 2022 13:43:35 +0100 Subject: [PATCH 44/50] code org and refactor --- grimer/callbacks.py | 88 ++--- grimer/cds.py | 133 +++---- grimer/config.py | 2 +- grimer/func.py | 833 ++++++++++++++++++++++++++++++++++++++++++++ grimer/grimer.py | 292 +++++----------- grimer/metadata.py | 1 - grimer/plots.py | 53 +-- grimer/table.py | 6 +- 8 files changed, 1060 insertions(+), 348 deletions(-) create mode 100644 grimer/func.py diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 820b549..7b07b7a 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -2,7 +2,7 @@ def link_obstable_samplebars(ele, - cds_p_obstable, + cds_m_obstable, cds_p_samplebars, cds_d_samples, dict_d_sampleobs, @@ -139,7 +139,7 @@ def link_obstable_samplebars(ele, args=dict(y2_select=ele["samplebars"]["wid"]["y2_select"], cds_p_samplebars=cds_p_samplebars, cds_d_samples=cds_d_samples, - cds_p_obstable=cds_p_obstable, + cds_m_obstable=cds_m_obstable, dict_d_sampleobs=dict_d_sampleobs, y_range=ele["samplebars"]["fig"].extra_y_ranges['obs'], min_obs_perc=min_obs_perc, @@ -147,7 +147,7 @@ def link_obstable_samplebars(ele, active_ranks=active_ranks), code=''' // get selected row from obstable [0 to get just the first] - var row = cds_p_obstable.selected.indices[0]; + var row = cds_m_obstable.selected.indices[0]; if (row!=undefined){ // get totals const total = cds_d_samples.data["cnt|total"]; @@ -156,7 +156,7 @@ def link_obstable_samplebars(ele, // get rank let rank = active_ranks[r]; // get taxid of the rank - let taxid = cds_p_obstable.data["tax|"+rank][row]; + let taxid = cds_m_obstable.data["tax|"+rank][row]; // for each sample for (var i = 0; i < cds_d_samples.length; i++) { let sample = cds_d_samples.data["index"][i]; @@ -197,19 +197,19 @@ def link_obstable_samplebars(ele, ''') change_text_legend_obs_callback = CustomJS( - args=dict(cds_p_obstable=cds_p_obstable, + args=dict(cds_m_obstable=cds_m_obstable, legend_obs=ele["samplebars"]["legend_obs"], samplebars=ele["samplebars"]["fig"], active_ranks=active_ranks), code=''' // selected row const row = cb_obj.indices[0]; - const selected_rank = cds_p_obstable.data['col|rank'][row]; + const selected_rank = cds_m_obstable.data['col|rank'][row]; for(let r = 0; r < active_ranks.length; r++){ - let taxid = cds_p_obstable.data["tax|"+active_ranks[r]][row]; + let taxid = cds_m_obstable.data["tax|"+active_ranks[r]][row]; if (taxid){ - legend_obs.items[r].label = active_ranks[r] + "|" + cds_p_obstable.data['col|name'][cds_p_obstable.data['index'].indexOf(taxid)]; + legend_obs.items[r].label = active_ranks[r] + "|" + cds_m_obstable.data['col|name'][cds_m_obstable.data['index'].indexOf(taxid)]; }else{ legend_obs.items[r].label = active_ranks[r]; } @@ -232,7 +232,7 @@ def 
link_obstable_samplebars(ele, load_infopanel = CustomJS( args=dict(infopanel=ele["infopanel"]["textarea"], - cds_p_obstable=cds_p_obstable, + cds_m_obstable=cds_m_obstable, dict_d_refs=dict_d_refs, dict_d_taxname=dict_d_taxname, active_ranks=active_ranks), @@ -240,9 +240,9 @@ def link_obstable_samplebars(ele, // selected row var row = cb_obj.indices[0]; - const name = cds_p_obstable.data['col|name'][row]; - const rank = cds_p_obstable.data['col|rank'][row]; - const taxid = cds_p_obstable.data['index'][row]; + const name = cds_m_obstable.data['col|name'][row]; + const rank = cds_m_obstable.data['col|rank'][row]; + const taxid = cds_m_obstable.data['index'][row]; var text = ""; text+="[ Obs ]"; @@ -260,7 +260,7 @@ def link_obstable_samplebars(ele, var lineage = ""; for(let r = 0; r < active_ranks.length; r++){ - var obs_lin = cds_p_obstable.data["tax|" + active_ranks[r]][row]; + var obs_lin = cds_m_obstable.data["tax|" + active_ranks[r]][row]; if(taxid!=name){ if(dict_d_taxname[obs_lin]) lineage+=dict_d_taxname[obs_lin]+" | "; @@ -297,7 +297,7 @@ def link_obstable_samplebars(ele, decontam_callback = CustomJS( args=dict(cds_d_samples=cds_d_samples, - cds_p_obstable=cds_p_obstable, + cds_m_obstable=cds_m_obstable, dict_d_sampleobs=dict_d_sampleobs, cds_p_decontam=cds_p_decontam, cds_p_decontam_models=cds_p_decontam_models, @@ -306,8 +306,8 @@ def link_obstable_samplebars(ele, code=''' // selected row const row = cb_obj.indices[0]; - const taxid = cds_p_obstable.data["index"][row]; - const rank = cds_p_obstable.data["col|rank"][row]; + const taxid = cds_m_obstable.data["index"][row]; + const rank = cds_m_obstable.data["col|rank"][row]; const total = cds_d_samples.data["cnt|total"]; for(let i = 0; i < cds_p_decontam.length; i++){ let sample = cds_p_decontam.data["index"][i]; @@ -336,14 +336,14 @@ def link_obstable_samplebars(ele, args=dict(mgnify_fig=ele["mgnify"]["fig"], biome_spinner=ele["mgnify"]["wid"]["biome_spinner"], mgnify_filter=ele["mgnify"]["filter"], - cds_p_obstable=cds_p_obstable, + cds_m_obstable=cds_m_obstable, cds_p_mgnify=cds_p_mgnify), code=''' // selected row - const row = cds_p_obstable.selected.indices[0]; + const row = cds_m_obstable.selected.indices[0]; const indices = []; if (row!=undefined){ - const taxid = cds_p_obstable.data["index"][row]; + const taxid = cds_m_obstable.data["index"][row]; for(let i = 0; i < cds_p_mgnify.length; i++){ if(cds_p_mgnify.data["taxa"][i]==taxid && cds_p_mgnify.data["level"][i]==biome_spinner.value.toString()){ @@ -359,19 +359,19 @@ def link_obstable_samplebars(ele, args=dict(references_fig=ele["references"]["fig"], references_filter=ele["references"]["filter"], references_select=ele["references"]["wid"]["references_select"], - cds_p_obstable=cds_p_obstable, + cds_m_obstable=cds_m_obstable, cds_p_references=cds_p_references, active_ranks=active_ranks), code=''' // selected row - const row = cds_p_obstable.selected.indices[0]; + const row = cds_m_obstable.selected.indices[0]; const indices = []; if (row!=undefined){ for(let i = 0; i < cds_p_references.length; i++){ // for each rank for(let r = 0; r < active_ranks.length; r++){ // get taxid of the rank - let rank_obs = cds_p_obstable.data["tax|"+active_ranks[r]][row]; + let rank_obs = cds_m_obstable.data["tax|"+active_ranks[r]][row]; if(cds_p_references.data["obs"][i]==rank_obs && cds_p_references.data["rank"][i]==active_ranks[r] && cds_p_references.data["ref"][i]==references_select.value){ @@ -404,7 +404,7 @@ def link_obstable_samplebars(ele, if ele["references"]["filter"]: 
obstable_callbacks.append(references_callback) - cds_p_obstable.selected.js_on_change('indices', *obstable_callbacks) + cds_m_obstable.selected.js_on_change('indices', *obstable_callbacks) ele["samplebars"]["wid"]["sort_select"].js_on_change('value', sort_groupby_callback) ele["samplebars"]["wid"]["groupby1_select"].js_on_change('value', sort_groupby_callback) @@ -429,7 +429,7 @@ def link_heatmap_widgets(ele, dict_d_dedro_x, dict_d_dedro_y, cds_p_annotations, - cds_p_obstable, + cds_m_obstable, cds_p_heatmap, active_ranks, dict_d_taxname): @@ -465,22 +465,22 @@ def link_heatmap_widgets(ele, x_groupby_select=ele["heatmap"]["wid"]["x_groupby_select"], dict_d_hcluster_x=dict_d_hcluster_x, cds_p_annotations=cds_p_annotations, - cds_p_obstable=cds_p_obstable, + cds_m_obstable=cds_m_obstable, cds_p_heatmap=cds_p_heatmap, dict_d_taxname=dict_d_taxname), code=''' // selected rank const rank = rank_select.value; - // get index to access data from observations from cds_p_obstable + // get index to access data from observations from cds_m_obstable var obs_index = []; - for (let i = 0; i < cds_p_obstable.data["index"].length; i++) { - if(cds_p_obstable.data["col|rank"][i]==rank){ + for (let i = 0; i < cds_m_obstable.data["index"].length; i++) { + if(cds_m_obstable.data["col|rank"][i]==rank){ obs_index.push(i); } } - var annot_obs = obs_index.map( s => cds_p_obstable.data["index"][s] ); + var annot_obs = obs_index.map( s => cds_m_obstable.data["index"][s] ); var sorted_factors = []; var dict_factors = {}; @@ -498,10 +498,10 @@ def link_heatmap_widgets(ele, if (x_sort_select.value=="none"){ sort_col = obs_index; }else if (x_sort_select.value=="observations"){ - sort_col = obs_index.map( s => cds_p_obstable.data["col|name"][s] ); + sort_col = obs_index.map( s => cds_m_obstable.data["col|name"][s] ); sort_col_type = "string"; }else if (x_sort_select.value=="counts"){ - sort_col = obs_index.map( s => cds_p_obstable.data["col|total_counts"][s] ); + sort_col = obs_index.map( s => cds_m_obstable.data["col|total_counts"][s] ); }else if (x_sort_select.value.startsWith("annot|")){ const annot = x_sort_select.value.replace("annot|",""); // create array with zeros, mark with one if annotation is present @@ -524,7 +524,7 @@ def link_heatmap_widgets(ele, // if grouping with a higher rank if(active_ranks.indexOf(rank) > active_ranks.indexOf(group_rank)){ // group entries without selected rank with space " " - var groupby_col = obs_index.map(function(s) { return cds_p_obstable.data["tax|" + group_rank][s] == "" ? " " : dict_d_taxname[cds_p_obstable.data["tax|" + group_rank][s]]; }); + var groupby_col = obs_index.map(function(s) { return cds_m_obstable.data["tax|" + group_rank][s] == "" ? 
" " : dict_d_taxname[cds_m_obstable.data["tax|" + group_rank][s]]; }); var factors = []; for(let i = 0; i < annot_obs.length; i++){ dict_factors[annot_obs[i]] = [groupby_col[i], annot_obs[i]]; @@ -756,11 +756,11 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols ele["metadata"]["wid"]["toggle_legend"].js_on_click(metadata_multiselect_callback) -def link_obstable_filter(ele, cds_p_obstable, active_ranks): +def link_obstable_filter(ele, cds_m_obstable, active_ranks): filter_callback = CustomJS( - args=dict(cds_p_obstable=cds_p_obstable, + args=dict(cds_m_obstable=cds_m_obstable, active_ranks=active_ranks, - widgets_filter=ele["obstable"]["widgets_filter"], + filter=ele["obstable"]["filter"], frequency_spinner=ele["obstable"]["wid"]["frequency_spinner"], counts_perc_avg_spinner=ele["obstable"]["wid"]["counts_perc_avg_spinner"], total_counts_spinner=ele["obstable"]["wid"]["total_counts_spinner"], @@ -768,21 +768,21 @@ def link_obstable_filter(ele, cds_p_obstable, active_ranks): ), code=''' const indices = []; - for (var i = 0; i < cds_p_obstable.length; i++) { - if (cds_p_obstable.data['col|frequency_perc'][i] < (frequency_spinner.value/100)){ + for (var i = 0; i < cds_m_obstable.length; i++) { + if (cds_m_obstable.data['col|frequency_perc'][i] < (frequency_spinner.value/100)){ continue; } - if (cds_p_obstable.data['col|counts_perc_avg'][i] < (counts_perc_avg_spinner.value/100)){ + if (cds_m_obstable.data['col|counts_perc_avg'][i] < (counts_perc_avg_spinner.value/100)){ continue; } - if (cds_p_obstable.data['col|total_counts'][i] < (total_counts_spinner.value)){ + if (cds_m_obstable.data['col|total_counts'][i] < (total_counts_spinner.value)){ continue; } if (name_multichoice.value.length > 0 ){ var found = false; for(let r = 0; r < active_ranks.length; r++){ // Compare all names on multichoice (array) against cell - if (name_multichoice.value.indexOf(cds_p_obstable.data["tax|"+active_ranks[r]][i]) >= 0){ + if (name_multichoice.value.indexOf(cds_m_obstable.data["tax|"+active_ranks[r]][i]) >= 0){ found = true; break; } @@ -793,8 +793,8 @@ def link_obstable_filter(ele, cds_p_obstable, active_ranks): } indices.push(i); } - widgets_filter.indices = indices; - cds_p_obstable.change.emit(); + filter.indices = indices; + cds_m_obstable.change.emit(); ''') ele["obstable"]["wid"]["frequency_spinner"].js_on_change('value', filter_callback) ele["obstable"]["wid"]["counts_perc_avg_spinner"].js_on_change('value', filter_callback) @@ -821,7 +821,7 @@ def link_correlation_widgets(ele, cds_p_correlation): ''') filter_callback = CustomJS( - args=dict(rho_filter=ele["correlation"]["rho_filter"], + args=dict(filter=ele["correlation"]["filter"], neg_slider=ele["correlation"]["wid"]["neg_slider"], pos_slider=ele["correlation"]["wid"]["pos_slider"], cds_p_correlation=cds_p_correlation), @@ -835,7 +835,7 @@ def link_correlation_widgets(ele, cds_p_correlation): indices.push(i) } } - rho_filter.indices = indices; + filter.indices = indices; cds_p_correlation.change.emit(); ''') diff --git a/grimer/cds.py b/grimer/cds.py index 71f10a8..775de2e 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -4,7 +4,7 @@ from math import pi #Internal -from grimer.utils import print_df, transform_table, print_log, pairwise_rho, format_js_toString +from grimer.func import print_df, transform_table, print_log, pairwise_rho, format_js_toString #Bokeh from bokeh.models import ColumnDataSource @@ -27,13 +27,14 @@ def generate_cds_plot_references(table, tax, references): # index -> observations (repeated) # 
columns -> "rank", "ref", "direct", "parent" clist = [] - for rank in table.ranks(): - for obs in table.observations(rank): - for desc, ref in references.items(): - direct = ref.get_refs_count(obs, direct=True) - parent = ref.get_refs_count(obs, parents=True) - if direct + parent > 0: - clist.append([obs, rank, desc, direct, parent]) + if references is not None: + for rank in table.ranks(): + for obs in table.observations(rank): + for desc, ref in references.items(): + direct = ref.get_refs_count(obs, direct=True) + parent = ref.get_refs_count(obs, parents=True) + if direct + parent > 0: + clist.append([obs, rank, desc, direct, parent]) df_references = pd.DataFrame(clist, columns=["obs", "rank", "ref", "direct", "parent"]) df_references.set_index('obs', inplace=True) @@ -57,11 +58,12 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp if contaminants.any(): df_rank["decontam"] = decontam.get_pscore(rank, df_rank.index)[contaminants] - for desc, ref in references.items(): - df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) - df_rank.loc[df_rank[desc] == 0, desc] = np.nan + if references is not None: + for desc, ref in references.items(): + df_rank[desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)) + df_rank.loc[df_rank[desc] == 0, desc] = np.nan - if controls: + if controls is not None: for desc, ctrl in controls.items(): control_table = table.get_subtable(samples=control_samples[desc], rank=rank) freq_perc_control = control_table.gt(0).sum(axis=0) / control_table.shape[0] @@ -78,13 +80,14 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp df_rank.loc[df_rank["annot"] == "decontam", "tv"] = 1 - ((df_rank[df_rank["annot"] == "decontam"]["ov"] - min_val) / (max_val - min_val)) # max references divided by max - for desc, ref in references.items(): - if not df_rank[df_rank["annot"] == desc].empty: - max_val = df_rank[df_rank["annot"] == desc]["ov"].max() - df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] / max_val + if references is not None: + for desc, ref in references.items(): + if not df_rank[df_rank["annot"] == desc].empty: + max_val = df_rank[df_rank["annot"] == desc]["ov"].max() + df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] / max_val # keep same percentage - if controls: + if controls is not None: for desc, ctrl in controls.items(): if not df_rank.loc[df_rank["annot"] == desc].empty: df_rank.loc[df_rank["annot"] == desc, "tv"] = df_rank.loc[df_rank["annot"] == desc, "ov"] @@ -99,7 +102,7 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp return ColumnDataSource(df_annotations) -def generate_cds_obstable(table, tax, references, controls, control_samples, decontam, normalized): +def generate_cds_obstable(table, tax, references, controls, control_samples, decontam): # index unique taxids # col|... values to plot to columns in the datatable # tax|... 
auxiliary lineage of taxa entries @@ -127,11 +130,12 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec df_rank["col|decontam"] = decontam.get_contaminants(rank, df_rank.index) # Add a column for each Annotation source - for desc, ref in references.items(): - df_rank["col|" + desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)).to_list() + if references is not None: + for desc, ref in references.items(): + df_rank["col|" + desc] = table.observations(rank).map(lambda x: ref.get_refs_count(x, direct=True)).to_list() # Add a column for each Control source - if controls: + if controls is not None: # calculate frequency for each group of control provided for desc, ctrl in controls.items(): control_table = table.get_subtable(samples=control_samples[desc], rank=rank) @@ -152,19 +156,19 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec # Concat in the main df df_obstable = pd.concat([df_obstable, df_rank], axis=0) - print_df(df_obstable, "cds_p_obstable") + print_df(df_obstable, "cds_m_obstable") return ColumnDataSource(df_obstable) -def generate_cds_sampletable(table, normalized): +def generate_cds_sampletable(table): # index unique sample-ids # col|... values to plot to columns in the datatable df_sampletable = pd.DataFrame(index=table.samples) - df_sampletable["col|total"] = table.get_total() if not normalized else 0 - df_sampletable["col|assigned"] = table.get_assigned() if not normalized else 0 + df_sampletable["col|total"] = table.get_total() if not table.normalized else 0 + df_sampletable["col|assigned"] = table.get_assigned() if not table.normalized else 0 df_sampletable["col|assigned_perc"] = table.get_assigned_perc() - df_sampletable["col|unassigned"] = table.get_unassigned() if not normalized else 0 + df_sampletable["col|unassigned"] = table.get_unassigned() if not table.normalized else 0 df_sampletable["col|unassigned_perc"] = table.get_unassigned_perc() # assigned by rank @@ -218,8 +222,10 @@ def generate_cds_samples(table, references, controls, decontam): df_samples["cnt|" + rank + "|assigned"] = table.data[rank].sum(axis=1) # Add counts specific to sources - source_list = [references.items()] - if controls: + source_list = [] + if references is not None: + source_list.append(references.items()) + if controls is not None: source_list.append(controls.items()) for sources in source_list: @@ -324,7 +330,7 @@ def generate_dict_sampleobs(table): return dict_sampleobs -def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): +def generate_cds_heatmap(table, transformation, show_zeros): # Stacked matrix of raw counts + transformed value # index -> sample-ids (repeated) # obs @@ -338,7 +344,7 @@ def generate_cds_heatmap(table, transformation, replace_zero_value, show_zeros): # Rename first col to obs stacked_rank_df.rename(columns={stacked_rank_df.columns[0]: "obs"}, inplace=True) stacked_rank_df["rank"] = rank - tv = transform_table(table.data[rank], table.total, transformation, replace_zero_value) + tv = transform_table(table.data[rank], table.total, transformation, table.zerorep) stacked_rank_df["tv"] = tv.stack().values #Drop zeros based on original counts if not show_zeros: @@ -431,61 +437,36 @@ def generate_dict_refs(table, references): for rank in table.ranks(): used_ids.update(table.observations(rank)) - for i in used_ids: - for sname, s in references.items(): - for ref, descs in s.get_refs_desc(i, direct=True).items(): - for desc in descs: - # Only add items if 
they have a reference to it - if i not in d_refs: - d_refs[i] = {} - if sname not in d_refs[i]: - d_refs[i][sname] = {} - if desc not in d_refs[i][sname]: - d_refs[i][sname][desc] = [] - d_refs[i][sname][desc].append(ref) + if references is not None: + for i in used_ids: + for sname, s in references.items(): + for ref, descs in s.get_refs_desc(i, direct=True).items(): + for desc in descs: + # Only add items if they have a reference to it + if i not in d_refs: + d_refs[i] = {} + if sname not in d_refs[i]: + d_refs[i][sname] = {} + if desc not in d_refs[i][sname]: + d_refs[i][sname][desc] = [] + d_refs[i][sname][desc].append(ref) print_df(d_refs, "dict_d_refs") return d_refs -def generate_cds_correlation(table, top_obs_corr, replace_zero_value): - # index (repeated taxids) - # other taxid - # rank - # rho - +def generate_cds_correlation(table, corr): df_corr = pd.DataFrame(columns=["taxid", "rank", "rho"]) for rank in table.ranks(): - if top_obs_corr: - top_taxids = sorted(table.get_top(rank, top_obs_corr)) - matrix = table.get_subtable(taxids=top_taxids, rank=rank) - else: - top_taxids = sorted(table.observations(rank)) - matrix = table.data[rank] - - # No correlation with just one observation - if len(matrix.columns) >= 2: - - rho = pairwise_rho(transform_table(matrix, 0, "clr", replace_zero_value).values) - - if len(matrix.columns) == 2: - # If there are only 2 observations, return in a float - # re-format in a matrix shape - rho = np.array([[np.nan, np.nan], [rho[1, 0], np.nan]]) - else: - # fill upper triangular matrix (mirrored values) with nan to be ignored by pandas - # to save half of the space - rho[np.triu_indices(rho.shape[0])] = np.nan - - stacked_rank_df = pd.DataFrame(rho, index=top_taxids, columns=top_taxids).stack(dropna=False).reset_index(1) - stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True) - stacked_rank_df.rename(columns={0: "rho"}, inplace=True) - stacked_rank_df["rank"] = rank + stacked_rank_df = pd.DataFrame(corr[rank]["rho"], index=corr[rank]["observations"], columns=corr[rank]["observations"]).stack(dropna=False).reset_index(1) + stacked_rank_df.rename(columns={"level_1": "taxid"}, inplace=True) + stacked_rank_df.rename(columns={0: "rho"}, inplace=True) + stacked_rank_df["rank"] = rank - # Drop NA for rho (missing values and upper triangular matrix) - stacked_rank_df.dropna(subset=['rho'], inplace=True) + # Drop NA for rho (missing values and upper triangular matrix) + stacked_rank_df.dropna(subset=['rho'], inplace=True) - df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) + df_corr = pd.concat([df_corr, stacked_rank_df], axis=0) print_df(df_corr, "cds_p_correlation") return ColumnDataSource(df_corr) diff --git a/grimer/config.py b/grimer/config.py index 3772c26..aafa64b 100644 --- a/grimer/config.py +++ b/grimer/config.py @@ -16,7 +16,7 @@ def __new__(self, argv=None): parser.add_argument('-i', '--input-file', required=True, type=str, help="Main input table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --tranpose if your file is reversed). First column and first row are used as headers.") parser.add_argument('-c', '--config', type=str, help="Configuration file") - parser.add_argument('-m', '--metadata', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. 
QIIME 2 metadata format is also accepted, with categorical and numerical fields.") + parser.add_argument('-m', '--metadata-file', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. QIIME 2 metadata format is also accepted, with categorical and numerical fields.") parser.add_argument('-t', '--tax', type=str, default=None, help="Define taxonomy to use. By default, do not use any taxonomy.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"]) parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=None, help="Taxonomy files. If not provided, will automatically be downloaded.") parser.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000") diff --git a/grimer/func.py b/grimer/func.py new file mode 100644 index 0000000..5aeea19 --- /dev/null +++ b/grimer/func.py @@ -0,0 +1,833 @@ +#General +import numpy as np +import os +import sys +import subprocess +import shlex +import pandas as pd +import yaml + +#Internal +from grimer.decontam import Decontam +from grimer.metadata import Metadata +from grimer.reference import Reference +from grimer.mgnify import MGnify +from grimer.table import Table + +# Bokeh +from bokeh.palettes import Category10, Category20, Colorblind, linear_palette, Turbo256 + +# MultiTax +from multitax import * + +#biom +import biom + +# scikit-bio +from skbio.stats.composition import clr + +# Scipy +import scipy.cluster.hierarchy as sch + + +def parse_config_file(config): + """ + parse yaml configuration file + """ + cfg = None + if config: + with open(config, 'r') as file: + cfg = yaml.safe_load(file) + return cfg + + +def parse_taxonomy(taxonomy, tax_files): + tax = None + if taxonomy == "ncbi": + tax = NcbiTx(files=tax_files, extended_names=True) + elif taxonomy == "gtdb": + tax = GtdbTx(files=tax_files) + elif taxonomy == "silva": + tax = SilvaTx(files=tax_files) + elif taxonomy == "greengenes": + tax = GreengenesTx(files=tax_files) + elif taxonomy == "ott": + tax = OttTx(files=tax_files, extended_names=True) + return tax + + +def parse_table(args, tax): + table = None + + # Specific params if biom file is provided + if args.input_file.endswith(".biom"): + if not args.level_separator: + args.level_separator = ";" + args.transpose = True + + # Read and return full table with separated total and unassigned counts (sharing same index) + table_df, total, unassigned = parse_input_file(args.input_file, args.unassigned_header, args.transpose, args.sample_replace) + + # Define if table is already normalized (0-100) or has count data + if args.values == "count": + normalized = False + elif args.values == "normalized": + normalized = True + elif (table_df.sum(axis=1).round() == 100).all() or (table_df % 1 != 0).any().any(): + normalized = True + else: + normalized = False + + # Zero replacement + try: + replace_zero_value = table_df[table_df.gt(0)].min().min() / int(args.replace_zeros) + except: + replace_zero_value = float(args.replace_zeros) + if replace_zero_value == 1 and args.transformation == "log": + replace_zero_value = 0.999999 # Do not allow value 1 using log + + # Split table into ranks. 
Ranks are either in the headers in multi level tables or will be created for a one level table + if args.level_separator: + ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) + else: + ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, Config.default_rank_name) + + if not ranked_tables: + print_log("Could not parse input table") + return 1 + + table = Table(table_df.index, total, unassigned, lineage, normalized, replace_zero_value) + + print_log("") + print_log("Total valid samples: " + str(len(table.samples))) + # Check for long sample headers, break some plots + long_sample_headers = [h for h in table_df.index if len(h) > 70] + if long_sample_headers: + print_log("Long sample labels/headers detected, plots may break: ") + print_log("\n".join(long_sample_headers)) + print_log("") + + for r, t in ranked_tables.items(): + print_log("--- " + r + " ---") + filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count, normalized)) + if t.empty: + print_log("No valid entries, skipping") + else: + # Trim table for empty zeros rows/cols + table.add_rank(r, filtered_trimmed_t) + print_log("Total valid observations: " + str(len(table.observations(r)))) + + print_log("") + if not normalized: + print_log("Total assigned (counts): " + str(table.total.sum() - table.unassigned.sum())) + print_log("Total unassigned (counts): " + str(table.unassigned.sum())) + print_log("") + + return table + + +def parse_metadata(args, table): + metadata = None + + if args.metadata_file: + metadata = Metadata(metadata_file=args.metadata_file, samples=table.samples.to_list()) + elif args.input_file.endswith(".biom"): + try: + biom_in = biom.load_table(args.input_file) + if biom_in.metadata() is not None: + metadata = Metadata(metadata_table=biom_in.metadata_to_dataframe(axis="sample"), samples=table.samples.to_list()) + except: + metadata = None + print_log("Error parsing metadata from BIOM file") + + if metadata is None or metadata.data.empty: + metadata = None + print_log("No valid metadata") + else: + print_log("Samples: " + str(metadata.data.shape[0])) + print_log("Numeric Fields: " + str(metadata.get_data("numeric").shape[1])) + print_log("Categorical Fields: " + str(metadata.get_data("categorical").shape[1])) + if len(metadata.get_col_headers()) < args.metadata_cols: + args.metadata_cols = len(metadata.get_col_headers()) + print_log("") + + return metadata + + +def parse_references(cfg, tax, taxonomy, ranks): + references = None + if "references" in cfg and taxonomy == "ncbi": + references = {} + for desc, sf in cfg["references"].items(): + references[desc] = Reference(file=sf) + if tax: + # Update taxids / get taxid from name + references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) + for i in list(references[desc].ids.keys()): + # lineage of all parent nodes (without itself) + for l in tax.lineage(i)[:-1]: + references[desc].add_parent(l, i) + return references + + +def parse_controls(cfg, table): + controls = None + control_samples = None + + if "controls" in cfg: + controls = {} + control_samples = {} + for desc, cf in cfg["controls"].items(): + with open(cf, "r") as file: + samples = file.read().splitlines() + obs = set() + valid_samples = set() + for rank in table.ranks(): + # Retrieve sub-table for every rank + control_table = table.get_subtable(rank, samples=samples) + obs.update(control_table.columns.to_list()) + 
valid_samples.update(control_table.index.to_list()) + + # Add control observations as a reference + controls[desc] = Reference(ids=obs) + control_samples[desc] = list(valid_samples) + + return controls, control_samples + + +def parse_mgnify(run_mgnify, cfg, tax, ranks): + mgnify = None + if run_mgnify: + if cfg and "mgnify" in cfg["external"]: + print_log("- Parsing MGNify") + mgnify = MGnify(cfg["external"]["mgnify"], ranks=ranks) + if tax: + mgnify.update_taxids(update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax)) + print_log("") + else: + print("Configuration file not found. Skipping MGnify") + print_log("") + + return mgnify + + +def run_correlation(table, top_obs_corr): + corr = {} + for rank in table.ranks(): + corr[rank] = {} + if top_obs_corr: + top_taxids = sorted(table.get_top(rank, top_obs_corr)) + matrix = table.get_subtable(taxids=top_taxids, rank=rank) + else: + top_taxids = sorted(table.observations(rank)) + matrix = table.data[rank] + + corr[rank]["observations"] = top_taxids + corr[rank]["rho"] = [] + # No correlation with just one observation + if len(matrix.columns) >= 2: + rho = pairwise_rho(transform_table(matrix, 0, "clr", table.zerorep).values) + if len(matrix.columns) == 2: + # If there are only 2 observations, return in a float + # re-format in a matrix shape + rho = np.array([[np.nan, np.nan], [rho[1, 0], np.nan]]) + else: + # fill upper triangular matrix (mirrored values) with nan to be ignored by pandas + # to save half of the space + rho[np.triu_indices(rho.shape[0])] = np.nan + + corr[rank]["rho"] = rho + + return corr + + +def parse_input_file(input_file, unassigned_header, transpose, sample_replace): + + if input_file.endswith(".biom"): + table_df = biom.load_table(input_file).to_dataframe(dense=True) + else: + # Default input_file: index=observations, columns=samples + # table_df should have samples on indices and observations on columns + table_df = pd.read_table(input_file, sep='\t', index_col=0).transpose().fillna(0) + + # If user is providing a reverse table, turn back + if transpose: + table_df = table_df.transpose() + + # Remove header on rows + table_df.index.name = None + + # Replace text on sample labels + if sample_replace: + print_log("Replacing sample label values:") + before_replace = table_df.head(1).index + #get index as series to use replace method + new_index = table_df.reset_index()["index"].replace(regex=dict(zip(sample_replace[::2], sample_replace[1::2]))) + table_df.set_index(new_index, inplace=True) + for b, a in zip(before_replace, table_df.head(1).index): + print_log(" " + b + " -> " + a) + print_log(" ...") + + # Sum total before split unassigned or filter + total = table_df.sum(axis=1) + + # unique unassigned/unclassified for table + # Separate unassigned counts column from main data frame + unassigned = pd.Series(0, index=table_df.index) + if unassigned_header: + for header in unassigned_header: + if header in table_df.columns: + if isinstance(table_df[header], pd.DataFrame): + # Sum in case there are several equally named headers + unassigned += table_df[header].sum(axis=1) + else: + # return a pd.Series + unassigned += table_df[header] + table_df.drop(columns=header, inplace=True) + else: + print_log("'" + header + "' header not found") + + if unassigned.sum() == 0: + print_log("No unassigned entries defined") + + print_log("") + print_log("- Trimming table") + table_df = trim_table(table_df) + + # Filter based on the final table + unassigned = unassigned.reindex(table_df.index) + total = 
total.reindex(table_df.index)
+
+    return table_df, total, unassigned
+
+
+def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count, normalized):
+
+    if min_count:
+        cnt = table_df.sum().sum()
+        if min_count < 1:
+            table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df
+            table_df = table_df[table_df_norm >= min_count].fillna(0)
+        elif min_count > 1:
+            table_df = table_df[table_df >= min_count].fillna(0)
+        print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --min-count " + str(min_count))
+
+    if max_count:
+        cnt = table_df.sum().sum()
+        if max_count < 1:
+            table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df
+            table_df = table_df[table_df_norm <= max_count].fillna(0)
+        elif max_count > 1:
+            table_df = table_df[table_df <= max_count].fillna(0)
+        print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --max-count " + str(max_count))
+
+    if min_frequency:
+        cnt = table_df.shape[1]
+        table_df_freq = table_df.gt(0).sum(axis=0)
+        if min_frequency < 1:
+            table_df_freq = table_df_freq / table_df.shape[0]
+            table_df = table_df.loc[:, table_df_freq >= min_frequency]
+        elif min_frequency > 1:
+            table_df = table_df.loc[:, table_df_freq >= min_frequency]
+        print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --min-frequency " + str(min_frequency))
+
+    if max_frequency:
+        cnt = table_df.shape[1]
+        table_df_freq = table_df.gt(0).sum(axis=0)
+        if max_frequency < 1:
+            table_df_freq = table_df_freq / table_df.shape[0]
+            table_df = table_df.loc[:, table_df_freq <= max_frequency]
+        elif max_frequency > 1:
+            table_df = table_df.loc[:, table_df_freq <= max_frequency]
+        print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --max-frequency " + str(max_frequency))
+
+    return table_df
+
+
+def trim_table(table_df):
+    # Check for cols/rows with sum zero
+    zero_rows = table_df.sum(axis=1).eq(0)
+    if any(zero_rows):
+        table_df = table_df.loc[~zero_rows, :]
+        print_log(str(sum(zero_rows)) + " samples with only zeros removed")
+
+    zero_cols = table_df.sum(axis=0).eq(0)
+    if any(zero_cols):
+        table_df = table_df.loc[:, ~zero_cols]
+        print_log(str(sum(zero_cols)) + " observations with only zeros removed")
+
+    return table_df
+
+
+def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace):
+    from grimer.grimer import _debug
+
+    # Transpose table (observations as index) and expand ranks in columns
+    ranks_df = table_df.T.index.str.split(level_separator, expand=True).to_frame(index=False)
+
+    # For every pair of replace arguments
+    if obs_replace:
+        print_log("Replacing values:")
+        before_replace = ranks_df.dropna().head(1).values[0]
+        ranks_df.replace(regex=dict(zip(obs_replace[::2], obs_replace[1::2])), inplace=True)
+        for b, a in zip(before_replace, ranks_df.dropna().head(1).values[0]):
+            print_log(" " + b + " -> " + a)
+        print_log(" ...")
+
+    # replace entries that are entirely spaces or empty with NaN
+    ranks_df = ranks_df.replace(r'^\s*$', np.nan, regex=True)
+
+    # Set rank names, matching user defined or default
+    user_ranks = False
+    if len(ranks) == ranks_df.shape[1]:
+        parsed_ranks = {r: ranks[r] for r in range(ranks_df.shape[1])}
+        user_ranks = True
+    else:
+        print_log("Ranks provided (" + str(len(ranks)) + ") do not match file (" + str(ranks_df.shape[1]) + " levels). 
Using default named ranks.") + parsed_ranks = {r: "rank-" + str(r) for r in range(ranks_df.shape[1])} + ranks_df.rename(columns=parsed_ranks, inplace=True) + + # Update taxids + if tax: + unmatched_nodes = 0 + for i, r in parsed_ranks.items(): + rank_nodes = ranks_df[r].dropna().unique() + + # If there is at least one valid entry + if rank_nodes.any(): + # If user-provided ranks are matching, update nodes with rank + if user_ranks: + updated_nodes = {node: unode for (rank, node), unode in update_tax_nodes([(r, n) for n in rank_nodes], tax).items()} + else: + updated_nodes = update_tax_nodes(rank_nodes, tax) + + # Add nan to keep missing ranks (different than tax.undefined_node [None] which will keep the name) + updated_nodes[np.nan] = np.nan + ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] is not None else t) + del updated_nodes[np.nan] + + unmatched_nodes += list(updated_nodes.values()).count(tax.undefined_node) + + if unmatched_nodes: + print_log(str(unmatched_nodes) + " observations not found in taxonomy (but kept)") + + # Check unique lineage + for i, r in parsed_ranks.items(): + if i > 0: + lin_count = ranks_df.iloc[:, :i+1].drop_duplicates().groupby(r).count() + invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list() + if invalid: + print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) + if _debug: + print_log(",".join(invalid) + " observations removed with invalid lineage at " + r) + # Set to NaN to keep shape of ranks_df + ranks_df.loc[ranks_df[r].isin(invalid), r] = np.nan + + ranked_tables = {} + for i, r in parsed_ranks.items(): + # ranks_df and table_df.T have the same shape + ranked_table_df = pd.concat([ranks_df[r], table_df.T.reset_index(drop=True)], axis=1) + ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).sum().T + ranked_tables[r].columns.name = None + + lineage = ranks_df + return ranked_tables, lineage + + +def parse_single_table(table_df, ranks, tax, default_rank_name): + + # Update taxids + if tax: + updated_nodes = update_tax_nodes(table_df.columns, tax) + unmatched_nodes = list(updated_nodes.values()).count(tax.undefined_node) + if unmatched_nodes: + print_log(str(unmatched_nodes) + " observations not found in taxonomy") + for node, upd_node in updated_nodes.items(): + if upd_node is not None and upd_node != node: + # If updated node is a merge on an existing taxid, sum values + if upd_node in table_df: + table_df[upd_node] += table_df[node] + table_df.drop(columns=node, inplace=True) + print_log("Updated and merged taxonomic nodes: " + node + " -> " + upd_node) + else: + table_df.rename(columns={node: upd_node}, inplace=True) + print_log("Updated taxonomic node: " + node + " -> " + upd_node) + + # Generate ranks + ranked_tables = {} + for rank in ranks: + # Special case for "default" rank + if rank == default_rank_name: + ranked_tables[rank] = table_df + else: + taxid_parent_rank = {i: tax.parent_rank(tax.latest(i), rank) for i in table_df.columns} + rank_df = pd.DataFrame(index=table_df.index) + for taxid, parent_rank_taxid in taxid_parent_rank.items(): + if parent_rank_taxid is None: + #no_rank += 1 + continue + if parent_rank_taxid not in rank_df: + rank_df[parent_rank_taxid] = 0 + rank_df[parent_rank_taxid] += table_df[taxid] + + if not rank_df.empty: + ranked_tables[rank] = rank_df + + # Generate lineage + if tax: + lineage = pd.DataFrame(list(map(lambda t: tax.lineage(t, ranks=list(ranked_tables.keys())), table_df.columns)), columns=list(ranked_tables.keys())) + else: + lineage 
= pd.DataFrame() + + return ranked_tables, lineage + + +def transform_table(df, total_counts, transformation, replace_zero_value): + # Special case clr with one observation (result in zeros) + if transformation == "clr" and df.shape[1] == 1: + print_log("WARNING: using log instead of clr with one observation") + transformation = "log" + + if transformation == "log": + transformed_df = (df + replace_zero_value).apply(np.log10) + elif transformation == "clr": + transformed_df = pd.DataFrame(clr(df + replace_zero_value), index=df.index, columns=df.columns) + elif transformation == "norm": + transformed_df = df.divide(total_counts, axis=0) + replace_zero_value + else: + transformed_df = df + replace_zero_value + + return transformed_df + + +def update_tax_nodes(nodes, tax): + """ + nodes can be a list of strings: taxids or names or a list of tuples with (rank, taxid/name) + Return a dictionary mapping nodes and updated nodes (or None) + First look for id, if nothing found, lookup by unique name + """ + + updated_nodes = {} + for node in nodes: + if isinstance(node, tuple): + r = node[0] + n = node[1] + else: + r = None + n = node + + # Either returns same node, updated or tax.undefined_node (None) + updated_taxid = tax.latest(n) + if updated_taxid: + # Assign updated or same taxid + updated_nodes[node] = updated_taxid + else: + names = tax.search_name(n, rank=r, exact=True) + # Assign taxid if found unique name only + if names and len(names) == 1: + updated_nodes[node] = names[0] + else: + updated_nodes[node] = tax.undefined_node + + return updated_nodes + + +def run_decontam(cfg, table, metadata, control_samples): + decontam = None + if not cfg: + print("Configuration file not found. Skipping DECONTAM") + return None + + df_decontam = pd.DataFrame(index=table.samples, columns=["concentration", "controls"]) + cfg_decontam = cfg["external"]["decontam"] + tmp_output_prefix = "tmp_" + + # Collect metadata for DECONTAM (concentrations to use frequency and control for prevalence) + out_table = tmp_output_prefix + "table_counts.tsv" + out_concentration = tmp_output_prefix + "concentration_counts.tsv" + out_controls = tmp_output_prefix + "control_samples_list.txt" + if cfg_decontam["method"] in ["frequency", "combined"]: + out_concentration = tmp_output_prefix + "concentration_counts.tsv" + # Load frequency file, if provided + if "frequency_file" in cfg_decontam: + if os.path.isfile(cfg_decontam["frequency_file"]): + # Load concentrations from file and sort (reindex) based on table inputs + df_decontam["concentration"] = pd.read_table(cfg_decontam["frequency_file"], sep='\t', header=None, skiprows=0, index_col=0).reindex(table.samples) + # If any entry is unknown, input is incomplete + if df_decontam["concentration"].isnull().values.any(): + print_log("File " + cfg_decontam["frequency_file"] + " is incomplete (Missing: " + ",".join(df_decontam[df_decontam.isnull().any(axis=1)].index.to_list()) + ") Skipping DECONTAM.") + return None + else: + print_log("File " + cfg_decontam["frequency_file"] + " not found. Skipping DECONTAM.") + return None + elif "frequency_metadata" in cfg_decontam: + if cfg_decontam["frequency_metadata"] in metadata.get_col_headers(): + # Get concentrations from metadata + df_decontam["concentration"] = metadata.get_col(cfg_decontam["frequency_metadata"]) + else: + print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata. 
Skipping DECONTAM.")
+                return None
+        elif not table.normalized:
+            # Use total from table
+            print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)")
+            df_decontam["concentration"] = table.total
+        else:
+            print_log("Cannot run DECONTAM without concentration and normalized values")
+            return None
+        # Print concentrations to file
+        df_decontam["concentration"].to_csv(out_concentration, sep="\t", header=False, index=True)
+
+    if cfg_decontam["method"] in ["prevalence", "combined"]:
+        control_list = set()
+        if "prevalence_file" in cfg_decontam:
+            for file in cfg_decontam["prevalence_file"]:
+                if os.path.isfile(file):
+                    # Load controls from file
+                    control_list.update([line.rstrip() for line in open(file)])
+                else:
+                    print_log("File not found " + file)
+        elif "prevalence_metadata" in cfg_decontam:
+            for field, value in cfg_decontam["prevalence_metadata"].items():
+                if field in metadata.get_col_headers():
+                    control_list.update(metadata.get_subset(field, value).index)
+                else:
+                    print_log("Could not find " + field + " in the metadata.")
+        else:
+            # Use all samples passed as controls
+            for cs in control_samples.values():
+                control_list.update(cs)
+
+        # Select valid controls
+        df_decontam["controls"] = table.samples.isin(control_list)
+
+        if df_decontam["controls"].any():
+            print_log(str(df_decontam["controls"].sum()) + " valid control samples to be used by DECONTAM")
+            outf = open(out_controls, "w")
+            print("\n".join(df_decontam.index[df_decontam["controls"]]), file=outf)
+            outf.close()
+        else:
+            print("Could not find valid control entries. Skipping DECONTAM")
+            return None
+
+    decontam = Decontam(df_decontam)
+
+    # Run DECONTAM for each rank
+    for rank in table.ranks():
+
+        if len(table.observations(rank)) == 1:
+            decontam.add_rank_empty(rank, table.observations(rank))
+        else:
+            # normalize and write temporary table for each rank
+            if not table.normalized:
+                transform_table(table.data[rank], table.total[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True)
+            else:
+                table.data[rank].to_csv(out_table, sep="\t", header=True, index=True)
+
+            cmd = " ".join(["scripts/run_decontam.R",
+                            "--resout " + tmp_output_prefix + "decontam_out.tsv",
+                            "--modout " + tmp_output_prefix + "decontam_mod.tsv",
+                            "--counts " + out_table,
+                            "--concentrations " + out_concentration if cfg_decontam["method"] in ["frequency", "combined"] else "",
+                            "--controls " + out_controls if cfg_decontam["method"] in ["prevalence", "combined"] else "",
+                            "--method " + cfg_decontam["method"],
+                            "--threshold " + str(cfg_decontam["threshold"])])
+            stdout, stderr = run_cmd(cmd)
+
+            decontam.add_rank_results(rank, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv")
+
+    for file in [out_table, out_concentration, out_controls, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv"]:
+        if os.path.isfile(file):
+            os.remove(file)
+    return decontam
+
+
+def run_hclustering(table, linkage_methods, linkage_metrics, transformation, skip_dendrogram, optimal_ordering):
+    hcluster = {}
+    dendro = {}
+
+    for rank in table.ranks():
+
+        # Get .values of transform, numpy array
+        matrix = transform_table(table.data[rank], table.total, transformation, table.zerorep).values
+
+        hcluster[rank] = {}
+        dendro[rank] = {}
+        for method in linkage_methods:
+            hcluster[rank][method] = {}
+            dendro[rank][method] = {}
+            for metric in linkage_metrics:
+                hcluster[rank][method][metric] = {}
+                hcluster[rank][method][metric]["x"] = {}
+                hcluster[rank][method][metric]["y"] = {}
+
+                # Hierarchical clustering, returning dendrogram
+                # Only one observation does not cluster
+                if matrix.shape[1] > 1:
+                    x = sch.dendrogram(sch.linkage(matrix.transpose(), method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True)
+                    hcluster[rank][method][metric]["x"]["index"] = table.observations(rank)[x["leaves"]].to_list()
+                else:
+                    hcluster[rank][method][metric]["x"]["index"] = table.observations(rank).to_list()
+
+                # Only one sample does not cluster
+                if matrix.shape[0] > 1:
+                    y = sch.dendrogram(sch.linkage(matrix, method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True)
+                    hcluster[rank][method][metric]["y"]["index"] = table.samples[y["leaves"]].to_list()
+                else:
+                    hcluster[rank][method][metric]["y"]["index"] = table.samples.to_list()
+
+                if not skip_dendrogram:
+                    dendro[rank][method][metric] = {}
+                    dendro[rank][method][metric]["y"] = {}
+                    dendro[rank][method][metric]["x"] = {}
+
+                    # Save dendrogram values and colors
+                    xs, ys, colors = [[]] * 3
+                    if matrix.shape[1] > 1:
+                        xs, ys, colors = dendro_lines_color(x, "x")
+                    dendro[rank][method][metric]["x"]["xs"] = xs
+                    dendro[rank][method][metric]["x"]["ys"] = ys
+                    dendro[rank][method][metric]["x"]["colors"] = colors
+                    if matrix.shape[0] > 1:
+                        xs, ys, colors = dendro_lines_color(y, "y")
+                    dendro[rank][method][metric]["y"]["xs"] = xs
+                    dendro[rank][method][metric]["y"]["ys"] = ys
+                    dendro[rank][method][metric]["y"]["colors"] = colors
+
+    return hcluster, dendro
+
+
+def dendro_lines_color(dendro, axis):
+    icoord = pd.DataFrame(dendro["icoord"])
+    icoord = icoord * ((len(dendro["icoord"]) + 0.5) / icoord.max().max())
+    icoord = icoord.values.tolist()
+    if axis == "y":
+        dcoord = dendro["dcoord"]
+    else:
+        dcoord = [[-j for j in i] for i in dendro['dcoord']]
+
+    color_list = dendro["color_list"]
+    unique_colors = sorted(set(color_list))
+    cp = make_color_palette(len(unique_colors))
+    colors = [cp[unique_colors.index(colorid)] for colorid in color_list]
+
+    if axis == "y":
+        return dcoord, icoord, colors
+    else:
+        return icoord, dcoord, colors
+
+
+def include_scripts(scripts):
+    # Insert global js functions and css and return template
+    template = "{% block postamble %}"
+    for file, t in scripts.items():
+        with open(file, 'r') as file:
+            template += "<" + t + ">"
+            template += "".join(file.readlines())
+            template += "</" + t + ">"
+    template += "{% endblock %}"
+    return template
+
+
+def pairwise_vlr(mat):
+    cov = np.cov(mat.T, ddof=1)
+    diagonal = np.diagonal(cov)
+    return -2 * cov + diagonal[:, np.newaxis] + diagonal
+
+
+def pairwise_rho(mat):
+    variances = np.var(mat, axis=0, ddof=1)
+    return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances))
+
+
+def format_js_toString(val):
+    # Transform numeric value to float and string to match toString
+    return str(float(val)) if isinstance(val, (int, float)) else str(val)
+
+
+def make_color_palette(n_colors, linear: bool=False, palette: dict=None):
+    if isinstance(palette, dict) and n_colors <= max(palette.keys()):
+        # Special case for 1 and 2 (not in palettes)
+        palette = palette[3 if n_colors < 3 else n_colors]
+
+    if linear or n_colors > 20:
+        if not palette:
+            palette = Turbo256
+        if n_colors <= 256:
+            return linear_palette(palette, n_colors)
+        else:
+            # Repeat colors
+            return [palette[int(i * 256.0 / n_colors)] for i in range(n_colors)]
+    else:
+        # Select color palette based on number of requested colors
+        # Return the closest palette with the most distinct set of colors
+        if not palette:
+            if n_colors <= 8:
+                palette = Colorblind[8]
+            elif n_colors <= 10:
+                palette = Category10[10]
+            elif n_colors <= 20:
+                palette = Category20[20]
+            else:
+                palette = Turbo256
+
+        return palette[:n_colors]
+
+
+def run_cmd(cmd, print_stderr: bool=False, exit_on_error: bool=True):
+    errcode = 0
+    stdout = ""
+    stderr = ""
+    try:
+        process = subprocess.Popen(shlex.split(cmd),
+                                   universal_newlines=True,
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE)
+        # wait for the process to terminate
+        stdout, stderr = process.communicate()
+        errcode = process.returncode
+        if exit_on_error and errcode != 0:
+            raise Exception()
+        if print_stderr and stderr:
+            print_log(stderr)
+
+    except Exception as e:
+        print_log('The following command failed to run:\n' + cmd)
+        print_log(str(e))
+        print_log("Error code: " + str(errcode))
+        print_log("Out: ")
+        if stdout:
+            print_log(stdout)
+        print_log("Error: ")
+        if stderr:
+            print_log(stderr)
+        sys.exit(errcode)
+
+    return stdout, stderr
+
+
+def print_log(text):
+    sys.stderr.write(text + "\n")
+    sys.stderr.flush()
+
+
+def print_df(df, name: str=None):
+    from grimer.grimer import _debug
+    if _debug:
+        print(name)
+        if isinstance(df, dict):
+            if df:
+                print(len(df.keys()), "keys:", list(df.keys())[0], "...", list(df.keys())[-1])
+                #print(list(df.values())[0], "...", list(df.values())[-1])
+        else:
+            #print(df.columns)
+            print(df.head())
+            print(df.shape)
+        print("size:", sys.getsizeof(df))
+        print("-----------------------------------------------")
+
+
+def print_logo_cli(version):
+    print_log("==================")
+    print_log(" ╔═╗╦═╗╦╔╦╗╔═╗╦═╗ ")
+    print_log(" ║ ╦╠╦╝║║║║║╣ ╠╦╝ ")
+    print_log(" ╚═╝╩╚═╩╩ ╩╚═╝╩╚═ ")
+    print_log(" v" + version)
+    print_log("==================")
diff --git a/grimer/grimer.py b/grimer/grimer.py
index 1484f2c..ce3f4a6 100755
--- a/grimer/grimer.py
+++ b/grimer/grimer.py
@@ -3,21 +3,16 @@
 #General
 import argparse
-import yaml
+import sys
 
 #Internal
-from grimer.table import Table
-from grimer.metadata import Metadata
-from grimer.mgnify import MGnify
 from grimer.callbacks import *
 from grimer.cds import *
 from grimer.config import Config
 from grimer.layout import *
 from grimer.plots import *
-from grimer.utils import *
+from grimer.func import *
 
-# MultiTax
-from multitax import *
 
 #Bokeh
 from bokeh.io import save
@@ -25,194 +20,83 @@
 
 def main(argv=sys.argv[1:]):
-
+    """
+    GRIMER steps
+    1) Load data/analysis: parse configuration, load files and run analysis into data objects
+    e.g. args.input_file to Table() and decontam
+    2) Generate data sources: Convert objects and analysis into cds/dict
+    e.g. 
table to cds_m_obstable + 3) Plot figures and elements based on cds/dict (and some objects) + e.g cds_m_obstable to ele["obstable"]["fig"] + 4) Link javascript callbacks between elements and cds/dict + 5) Put elements into layout and generate report + """ + + # Parse CLI arguments args = Config(argv) print_logo_cli(Config.version) - + # Setup global _debug variable to be used by other files with #from grimer.grimer import _debug global _debug _debug = args.debug - # Config file - cfg = {} - if args.config: - with open(args.config, 'r') as file: - cfg = yaml.safe_load(file) - - # Taxonomy + # 1) Load data/analysis + cfg = None tax = None - if args.tax: - if args.tax_files: - print_log("- Parsing taxonomy") - else: - print_log("- Downloading and parsing taxonomy") - print_log(args.tax) - if args.tax == "ncbi": - tax = NcbiTx(files=args.tax_files, extended_names=True) - elif args.tax == "gtdb": - tax = GtdbTx(files=args.tax_files) - elif args.tax == "silva": - tax = SilvaTx(files=args.tax_files) - elif args.tax == "greengenes": - tax = GreengenesTx(files=args.tax_files) - elif args.tax == "ott": - tax = OttTx(files=args.tax_files, extended_names=True) - else: - print_log("- No taxonomy set") - print_log("") - - # Ranks - if not args.ranks: - args.ranks = [Config.default_rank_name] - - # Table of counts - print_log("- Parsing table") - - # Specific params if biom file is provided - if args.input_file.endswith(".biom"): - args.level_separator = ";" - args.transpose = True - - # Read and return full table with separated total and unassigned counts (sharing same index) - table_df, total, unassigned = parse_input_table(args.input_file, args.unassigned_header, args.transpose, args.sample_replace) - - # Define if table is already normalized (0-100) or has count data - if args.values == "count": - normalized = False - elif args.values == "normalized": - normalized = True - elif (table_df.sum(axis=1).round() == 100).all() or (table_df % 1 != 0).any().any(): - normalized = True - else: - normalized = False - if normalized: - print_log("- Table parsed with normalized values") + table = None + metadata = None + references = None + controls = None + control_samples = None + hcluster = None + dendro = None + corr = None + + print_log("- Parsing configuration file") + cfg = parse_config_file(args.config) + + print_log("- Parsing taxonomy") + tax = parse_taxonomy(args.tax, args.tax_files) + + print_log("- Parsing input table") + table = parse_table(args, tax) - # Split table into ranks. 
Ranks are either in the headers in multi level tables or will be created for a one level table - if args.level_separator: - ranked_tables, lineage = parse_multi_table(table_df, args.ranks, tax, args.level_separator, args.obs_replace) - else: - ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, Config.default_rank_name) - - if not ranked_tables: - print_log("Could not parse input table") - return 1 - - table = Table(table_df.index, total, unassigned, lineage, normalized) - - print_log("") - print_log("Total valid samples: " + str(len(table.samples))) - # Check for long sample headers, break some plots - long_sample_headers = [h for h in table_df.index if len(h) > 70] - if long_sample_headers: - print_log("Long sample labels/headers detected, plots may break: ") - print_log("\n".join(long_sample_headers)) - print_log("") - - for r, t in ranked_tables.items(): - print_log("--- " + r + " ---") - filtered_trimmed_t = trim_table(filter_input_table(t, total, args.min_frequency, args.max_frequency, args.min_count, args.max_count, normalized)) - if t.empty: - print_log("No valid entries, skipping") - else: - # Trim table for empty zeros rows/cols - table.add_rank(r, filtered_trimmed_t) - print_log("Total valid observations: " + str(len(table.observations(r)))) - - print_log("") - if not normalized: - print_log("Total assigned (counts): " + str(table.total.sum() - table.unassigned.sum())) - print_log("Total unassigned (counts): " + str(table.unassigned.sum())) - print_log("") - - # Zero replacement - try: - replace_zero_value = table_df[table_df.gt(0)].min().min() / int(args.replace_zeros) - except: - replace_zero_value = float(args.replace_zeros) - if replace_zero_value == 1 and args.transformation == "log": - replace_zero_value = 0.999999 # Do not allow value 1 using log - - # Metadata - max_metadata_cols = args.metadata_cols print_log("- Parsing metadata") - metadata = None - if args.metadata: - metadata = Metadata(metadata_file=args.metadata, samples=table.samples.to_list()) - elif args.input_file.endswith(".biom"): - try: - biom_in = biom.load_table(args.input_file) - if biom_in.metadata() is not None: - metadata = Metadata(metadata_table=biom_in.metadata_to_dataframe(axis="sample"), samples=table.samples.to_list()) - except: - metadata = None - print_log("Error parsing metadata from BIOM file") - - if metadata is None or metadata.data.empty: - metadata = None - print_log("No valid metadata") - else: - print_log("Samples: " + str(metadata.data.shape[0])) - print_log("Numeric Fields: " + str(metadata.get_data("numeric").shape[1])) - print_log("Categorical Fields: " + str(metadata.get_data("categorical").shape[1])) - if len(metadata.get_col_headers()) < args.metadata_cols: - max_metadata_cols = len(metadata.get_col_headers()) - print_log("") - - # References (only possible with ncbi identifiers) - references = {} - if "references" in cfg and args.tax == "ncbi": - print_log("- Parsing references") - references = parse_references(cfg, tax, table.ranks()) - print_log("") - - controls, control_samples = [{}, {}] - if "controls" in cfg: - print_log("- Parsing controls") - # Controls - controls, control_samples = parse_controls(cfg, table) - print_log("") - - # Run and load decontam results - decontam = None - if args.decontam: - print_log("- Running DECONTAM") - decontam = run_decontam(cfg, table, metadata, control_samples, normalized) - print_log("") - - # Mgnify - mgnify = None - if args.mgnify: - if cfg and "mgnify" in cfg["external"]: - print_log("- Parsing MGNify") - mgnify = 
MGnify(cfg["external"]["mgnify"], ranks=table.ranks() if args.ranks != [Config.default_rank_name] else [])
-            if tax:
-                mgnify.update_taxids(update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax))
-            print_log("")
-        else:
-            print("Configuration file not found. Skipping MGnify")
-            print_log("")
-
-    # Hiearchical clustering
+    metadata = parse_metadata(args, table)
+
+    print_log("- Parsing references")
+    references = parse_references(cfg, tax, args.tax, table.ranks())
+
+    print_log("- Parsing controls")
+    controls, control_samples = parse_controls(cfg, table)
+
+    print_log("- Parsing MGnify database")
+    mgnify = parse_mgnify(args.mgnify, cfg, tax, table.ranks())
+
+    print_log("- Running DECONTAM")
+    decontam = run_decontam(cfg, table, metadata, control_samples)
+
     print_log("- Running hierarchical clustering")
-    hcluster, dendro = run_hclustering(table, args.linkage_methods, args.linkage_metrics, args.transformation, replace_zero_value, args.skip_dendrogram, args.optimal_ordering)
-    print_log("")
+    hcluster, dendro = run_hclustering(table, args.linkage_methods, args.linkage_metrics, args.transformation, args.skip_dendrogram, args.optimal_ordering)
 
-    # save max/min values to control ranges
-    max_total_count = table.total.max()
-    min_obs_perc = min([table.get_counts_perc(rank)[table.get_counts_perc(rank) > 0].min().min() for rank in table.ranks()])
+    print_log("- Running correlation")
+    corr = run_correlation(table, args.top_obs_corr)
 
-    print_log("- Generating GRIMER report")
-    ############ cds (ColumnDataSource) and dict containers: data structures loaded and parsed by bokehjs
-    ############ "cds" for matrix like dataframes with fixed column sizes
-    ############ "dict" for variable column sizes
-    ############ _p_ : plot -> direct source of figures
-    ############ _d_ : data -> auxiliar containers to be used/shared among plots
-    ############ usually by copying and/or transforming values into a _p_ container
+    # 2) Generate data sources:
+    # cds (ColumnDataSource) and dict containers: data structures loaded and parsed by bokehjs
+    # "cds" for matrix-like dataframes with fixed column sizes
+    # "dict" for variable column sizes
+    # _p_ : plot -> direct source of figures either pre-loaded or empty
+    # _d_ : data -> auxiliary containers to be used/shared among plots
+    #       usually by copying and/or transforming values into a _p_ container
+    # _m_ : mixed -> contains both plot and data properties
 
-    # _p_
+    print_log("- Generating data sources")
+
+    # _m_
     # df: index (unique observations), col|..., tax|..., aux|ref
-    # this cds an exeption and contains data to plot (col|) and auxiliary data (tax|)
-    cds_p_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam, normalized)
+    cds_m_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam)
+
+    # _p_
     # df: index (unique sample-ids), aux|..., bar|..., tax|...
     cds_p_samplebars = generate_cds_samplebars(table)
     # stacked: index (repeated observations), rank, ref, direct, parent
@@ -224,23 +108,21 @@ def main(argv=sys.argv[1:]):
     # stacked: index (taxa, level, lineage), count, perc
     cds_p_mgnify = generate_cds_mgnify(mgnify, table, tax) if mgnify else None
     # stacked: index (repeated sample-ids), obs, rank, ov, tv
-    cds_p_heatmap = generate_cds_heatmap(table, args.transformation, replace_zero_value, args.show_zeros)
-    # matrix: index (unique sample-ids), md0, md1, ..., md(max_metadata_cols) -> (metadata field, metadata values)
-    cds_p_metadata = generate_cds_plot_metadata(metadata, max_metadata_cols) if metadata else None
+    cds_p_heatmap = generate_cds_heatmap(table, args.transformation, args.show_zeros)
+    # matrix: index (unique sample-ids), md0, md1, ..., md(args.metadata_cols) -> (metadata field, metadata values)
+    cds_p_metadata = generate_cds_plot_metadata(metadata, args.metadata_cols) if metadata else None
     # stacked: index (repeated observations), rank, annot
     cds_p_annotations = generate_cds_annotations(table, references, controls, decontam, control_samples)
     # empty matrix {"x": [], "y": [], "c": []}
     cds_p_dendro_x, cds_p_dendro_y = generate_cds_plot_dendro() if not args.skip_dendrogram else [None, None]
     # stacked: index (repeated observations), other observation, rank, rho
-    cds_p_correlation = generate_cds_correlation(table, args.top_obs_corr, replace_zero_value)
+    cds_p_correlation = generate_cds_correlation(table, corr)
     # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors
     cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars)
     # df: index (unique sample-ids), col|...
-    cds_p_sampletable = generate_cds_sampletable(table, normalized)
+    cds_p_sampletable = generate_cds_sampletable(table)
 
     # _d_
-    # dict: {rank: {obs: {sample: count}}}
-    dict_d_sampleobs = generate_dict_sampleobs(table)
     # df: index (unique sample-ids), aux|..., cnt|...,
     cds_d_samples = generate_cds_samples(table, references, controls, decontam)
     # matrix: index (unique sample-ids) x columns (metadata fields) -> metadata values
@@ -259,6 +141,8 @@ def main(argv=sys.argv[1:]):
     dict_d_topobs = generate_dict_topobs(table, args.top_obs_bars)
     # {taxid: {source: {desc: [refs]}}}
     dict_d_refs = generate_dict_refs(table, references)
+    # dict: {rank: {obs: {sample: count}}}
+    dict_d_sampleobs = generate_dict_sampleobs(table)
 
     ############ PLOT ELEMENTS (Figures, Widgets, ...) 
############ "fig": main figure @@ -270,12 +154,16 @@ def main(argv=sys.argv[1:]): sizes["overview_top_panel_width_left"] = 250 sizes["overview_top_panel_width_right"] = 450 + # Elements to plot + # ele[name]["fig"] -> main figure/element + # ele[name]["filter"] -> filter to the figure + # ele[name]["wid"][widget1] -> widgets to the figure ele = {} # obstable ele["obstable"] = {} - ele["obstable"]["fig"], ele["obstable"]["widgets_filter"] = plot_obstable(sizes, cds_p_obstable, table.ranks(), references.keys(), controls.keys()) - ele["obstable"]["wid"] = plot_obstable_widgets(sizes, dict_d_taxname, max(cds_p_obstable.data["col|total_counts"])) + ele["obstable"]["fig"], ele["obstable"]["filter"] = plot_obstable(sizes, cds_m_obstable, table.ranks(), references, controls) + ele["obstable"]["wid"] = plot_obstable_widgets(sizes, dict_d_taxname, max(cds_m_obstable.data["col|total_counts"])) # infopanel ele["infopanel"] = {} @@ -301,15 +189,15 @@ def main(argv=sys.argv[1:]): ele["decontam"] = {} ele["decontam"]["wid"] = {} if decontam: - ele["decontam"]["fig"] = plot_decontam(sizes, cds_p_decontam, cds_p_decontam_models, min_obs_perc) + ele["decontam"]["fig"] = plot_decontam(sizes, cds_p_decontam, cds_p_decontam_models, table.get_min_valid_count_perc()) else: ele["decontam"]["fig"] = None ele["decontam"]["wid"] = plot_decontam_widgets(sizes) # samplebars ele["samplebars"] = {} - ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, max_total_count, table.ranks(), normalized) - ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, list(references.keys()), list(controls.keys()), decontam, normalized) + ele["samplebars"]["fig"], ele["samplebars"]["legend_obs"], ele["samplebars"]["legend_bars"] = plot_samplebars(cds_p_samplebars, table) + ele["samplebars"]["wid"] = plot_samplebars_widgets(table.ranks(), metadata, references, controls, decontam, table.normalized) # sampletable ele["sampletable"] = {} @@ -320,7 +208,7 @@ def main(argv=sys.argv[1:]): tools_heatmap = "hover,save,box_zoom,reset,crosshair,box_select" ele["heatmap"] = {} ele["heatmap"]["fig"] = plot_heatmap(table, cds_p_heatmap, tools_heatmap, args.transformation, dict_d_taxname) - ele["heatmap"]["wid"] = plot_heatmap_widgets(table.ranks(), args.linkage_methods, args.linkage_metrics, list(references.keys()), list(controls.keys()), metadata, decontam) + ele["heatmap"]["wid"] = plot_heatmap_widgets(table.ranks(), args.linkage_methods, args.linkage_metrics, references, controls, metadata, decontam) # metadata (heatmap) ele["metadata"] = {} @@ -351,7 +239,7 @@ def main(argv=sys.argv[1:]): # correlation ele["correlation"] = {} - ele["correlation"]["fig"], ele["correlation"]["rho_filter"] = plot_correlation(cds_p_correlation, table.ranks(), dict_d_taxname) + ele["correlation"]["fig"], ele["correlation"]["filter"] = plot_correlation(cds_p_correlation, table.ranks(), dict_d_taxname) ele["correlation"]["wid"] = plot_correlation_widgets(table.ranks(), args.top_obs_corr) # obsbars @@ -361,10 +249,10 @@ def main(argv=sys.argv[1:]): ############ JAVASCRIPT LINKING - link_obstable_filter(ele, cds_p_obstable, table.ranks()) + link_obstable_filter(ele, cds_m_obstable, table.ranks()) link_obstable_samplebars(ele, - cds_p_obstable, + cds_m_obstable, cds_p_samplebars, cds_d_samples, dict_d_sampleobs, @@ -374,8 +262,8 @@ def main(argv=sys.argv[1:]): cds_d_decontam, cds_p_references, table.ranks(), - min_obs_perc, - max_total_count, + table.get_min_valid_count_perc(), 
+ table.get_total().max(), cds_p_mgnify, dict_d_refs, dict_d_taxname) @@ -391,12 +279,12 @@ def main(argv=sys.argv[1:]): dict_d_dedro_x, dict_d_dedro_y, cds_p_annotations, - cds_p_obstable, + cds_m_obstable, cds_p_heatmap, table.ranks(), dict_d_taxname) - link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols) + link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, args.metadata_cols) link_correlation_widgets(ele, cds_p_correlation) diff --git a/grimer/metadata.py b/grimer/metadata.py index 0b3abde..8fa3359 100644 --- a/grimer/metadata.py +++ b/grimer/metadata.py @@ -1,6 +1,5 @@ import pandas as pd from pandas.api.types import is_numeric_dtype -from grimer.utils import print_log class Metadata: diff --git a/grimer/plots.py b/grimer/plots.py index 2fe8c98..f5ed97d 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -8,14 +8,14 @@ from bokeh.plotting import figure from bokeh.transform import cumsum, factor_cmap, transform -from grimer.utils import format_js_toString, make_color_palette +from grimer.func import format_js_toString, make_color_palette -def plot_samplebars(cds_p_samplebars, max_total_count, ranks, normalized): +def plot_samplebars(cds_p_samplebars, table): # Bar plots has 3 main stacks: selection, others, unassigned # stacks can be annotated with references and controls samplebars_fig = figure(x_range=FactorRange(factors=cds_p_samplebars.data["aux|factors"]), - y_range=Range1d(start=0, end=max_total_count), + y_range=Range1d(start=0, end=table.get_total().max()), plot_height=400, sizing_mode="stretch_width", tools="box_zoom,reset,save") @@ -42,9 +42,9 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks, normalized): samplebars_fig.add_layout(LinearAxis(y_range_name="obs"), 'right') # Plot obs ranks - obs_palette = make_color_palette(len(ranks), palette=Dark2) + obs_palette = make_color_palette(len(table.ranks()), palette=Dark2) legend_obs_items = [] - for i, rank in enumerate(ranks): + for i, rank in enumerate(table.ranks()): ren = samplebars_fig.scatter(x="aux|factors", y="tax|" + rank, y_range_name="obs", name="tax|" + rank, # to work with hover properly @@ -55,7 +55,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks, normalized): legend_obs_items.append((rank, [ren])) # Legend counts (vbars) - legend_bars_items = [(f, [vbar_ren[i]]) for i, f in enumerate([ranks[0] + "|assigned"] + fixed_bar_options[1:])] + legend_bars_items = [(f, [vbar_ren[i]]) for i, f in enumerate([table.ranks()[0] + "|assigned"] + fixed_bar_options[1:])] legend_bars = Legend(items=legend_bars_items) legend_bars.margin = 0 legend_bars.border_line_width = 0 @@ -89,7 +89,7 @@ def plot_samplebars(cds_p_samplebars, max_total_count, ranks, normalized): samplebars_fig.xaxis.subgroup_label_orientation = "vertical" samplebars_fig.xaxis.axis_label = "samples" - samplebars_fig.yaxis[0].axis_label = "# counts" if not normalized else "% counts" + samplebars_fig.yaxis[0].axis_label = "# counts" if not table.normalized else "% counts" samplebars_fig.yaxis[1].axis_label = "% observations" samplebars_fig.yaxis[1].axis_label_text_color = "#606c38" @@ -232,13 +232,15 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs "help_button": help_button(title="Observation bars", text=help_text)} -def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, decontam, normalized): +def plot_samplebars_widgets(ranks, metadata, references, controls, decontam, normalized): annotbar_rank_select = Select(title="Annotate bars at rank:", 
value=ranks[0], options=[r for r in ranks]) annotbar_options = {} annotbar_options["Default"] = ["assigned"] - annotbar_options["References"] = [r for r in reference_names] - annotbar_options["Controls"] = [c for c in control_names] + if references is not None: + annotbar_options["References"] = [r for r in references.keys()] + if controls is not None: + annotbar_options["Controls"] = [c for c in controls.keys()] if decontam: annotbar_options["Decontam"] = ["decontam"] annotbar_select = Select(title="Annotate bars by:", value="assigned", options=annotbar_options) @@ -295,7 +297,7 @@ def plot_samplebars_widgets(ranks, metadata, reference_names, control_names, dec "help_button": help_button(title="Sample bars", text=help_text)} -def plot_obstable(sizes, cds_p_obstable, ranks, reference_names, control_names): +def plot_obstable(sizes, cds_m_obstable, ranks, references, controls): # General filter for widgets widgets_filter = IndexFilter() @@ -304,7 +306,7 @@ def plot_obstable(sizes, cds_p_obstable, ranks, reference_names, control_names): # Create table with view for each rank for rank in ranks: rank_filter = GroupFilter(column_name='col|rank', group=rank) - cds_view = CDSView(source=cds_p_obstable, filters=[rank_filter, widgets_filter]) + cds_view = CDSView(source=cds_m_obstable, filters=[rank_filter, widgets_filter]) table_cols = [] table_cols.append(TableColumn(field="col|name", title="Name")) @@ -312,13 +314,16 @@ def plot_obstable(sizes, cds_p_obstable, ranks, reference_names, control_names): table_cols.append(TableColumn(field="col|counts_perc_avg", title="Avg. counts/sample", default_sort="descending", formatter=NumberFormatter(format="0.00%"))) table_cols.append(TableColumn(field="col|total_counts", title="Total counts", default_sort="descending")) - for ctrl_name in control_names: - table_cols.append(TableColumn(field="col|" + ctrl_name, title="(F) " + ctrl_name, default_sort="descending", formatter=NumberFormatter(format="0.00%"))) + if references is not None: + for ref_name in references.keys(): + table_cols.append(TableColumn(field="col|" + ref_name, title=ref_name, default_sort="descending")) - for ref_name in reference_names: - table_cols.append(TableColumn(field="col|" + ref_name, title=ref_name, default_sort="descending")) + if controls is not None: + for ctrl_name in controls.keys(): + table_cols.append(TableColumn(field="col|" + ctrl_name, title="(F) " + ctrl_name, default_sort="descending", formatter=NumberFormatter(format="0.00%"))) - if "col|decontam" in cds_p_obstable.data: + + if "col|decontam" in cds_m_obstable.data: table_cols.append(TableColumn(field="col|decontam", title="DECONTAM", default_sort="descending")) datatable = DataTable(height=sizes["overview_top_panel_height"], @@ -329,7 +334,7 @@ def plot_obstable(sizes, cds_p_obstable, ranks, reference_names, control_names): #selectable="checkbox", frozen_columns=1, columns=table_cols, - source=cds_p_obstable, + source=cds_m_obstable, view=cds_view) obstable_tabs.append(Panel(child=datatable, title=rank)) @@ -573,7 +578,8 @@ def plot_references(sizes, table, cds_p_references, dict_d_taxname): def plot_references_widgets(sizes, references): - references_select = Select(value=list(references.keys())[0] if references else None, width=sizes["overview_top_panel_width_right"] - 70, options=list(references.keys())) + ref_names = list(references.keys()) if references is not None else [] + references_select = Select(value=ref_names[0] if ref_names else None, width=sizes["overview_top_panel_width_right"] - 70, options=ref_names) 
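Aside: the obstable pattern above hangs every rank tab off a single ColumnDataSource, so each DataTable differs only by its CDSView (a per-rank GroupFilter plus the shared widgets_filter IndexFilter); updating the shared filter's indices refreshes all tabs at once. A minimal, self-contained sketch of that pattern, assuming the Bokeh 2.x API used throughout this patch and purely illustrative data:

    from bokeh.models import (CDSView, ColumnDataSource, DataTable, GroupFilter,
                              IndexFilter, Panel, TableColumn, Tabs)

    # Single shared source; "col|rank" tells each view which rows it owns
    cds = ColumnDataSource({"col|name": ["taxA", "taxB", "taxC", "taxD"],
                            "col|rank": ["genus", "genus", "species", "species"]})
    # One IndexFilter shared by all views: widget callbacks only update this
    shared_filter = IndexFilter(indices=[0, 1, 2, 3])

    panels = []
    for rank in ["genus", "species"]:
        view = CDSView(source=cds, filters=[GroupFilter(column_name="col|rank", group=rank),
                                            shared_filter])
        table = DataTable(source=cds, view=view,
                          columns=[TableColumn(field="col|name", title="Name")])
        panels.append(Panel(child=table, title=rank))
    tabs = Tabs(tabs=panels)

Setting shared_filter.indices = [0, 2], for example, hides taxB and taxD from their respective tabs without touching the source data.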
help_text = """ Plot of number of occurences of provided references for each observation and its lineage. @@ -717,7 +723,7 @@ def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_tax return heatmap -def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_names, controls_names, metadata, decontam): +def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, references, controls, metadata, decontam): rank_select = Select(title="Taxonomic rank:", value=ranks[0], options=ranks) @@ -733,9 +739,10 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, reference_name x_sort_options = {} x_sort_options["Default"] = [("none", "none"), ("counts", "counts"), ("observations", "observations")] - x_sort_options["References"] = [("annot|" + r, r) for r in reference_names] - if controls_names: - x_sort_options["Controls"] = [("annot|" + c, c) for c in controls_names] + if references is not None: + x_sort_options["References"] = [("annot|" + r, r) for r in references.keys()] + if controls is not None: + x_sort_options["Controls"] = [("annot|" + c, c) for c in controls.keys()] if decontam: x_sort_options["DECONTAM"] = [("annot|decontam", "decontam")] diff --git a/grimer/table.py b/grimer/table.py index e9ab0b5..2206533 100644 --- a/grimer/table.py +++ b/grimer/table.py @@ -2,7 +2,7 @@ class Table: - def __init__(self, samples, total, unassigned, lineage, normalized): + def __init__(self, samples, total, unassigned, lineage, normalized, zerorep): # Ordered dict to keep rank insert order self.data = OrderedDict() self.lineage = lineage @@ -10,6 +10,7 @@ def __init__(self, samples, total, unassigned, lineage, normalized): self.total = total self.unassigned = unassigned self.normalized = normalized + self.zerorep = zerorep def __repr__(self): args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()] @@ -24,6 +25,9 @@ def observations(self, rank): def ranks(self): return list(self.data.keys()) + def get_min_valid_count_perc(self): + return min([self.get_counts_perc(rank)[self.get_counts_perc(rank) > 0].min().min() for rank in self.ranks()]) + def get_total(self): return self.total From fe5392cb8a1b545404c05dc89dc94fe2cc6ef3cd Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Fri, 4 Mar 2022 17:00:18 +0100 Subject: [PATCH 45/50] better code org, error msg --- grimer/cds.py | 44 ++-- grimer/config.py | 2 +- grimer/func.py | 176 +++++++------ grimer/grimer.py | 80 +++--- grimer/utils.py | 654 ----------------------------------------------- 5 files changed, 158 insertions(+), 798 deletions(-) delete mode 100644 grimer/utils.py diff --git a/grimer/cds.py b/grimer/cds.py index 775de2e..23812cf 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -10,7 +10,7 @@ from bokeh.models import ColumnDataSource -def generate_dict_taxname(tax, taxids): +def dict_taxname(tax, taxids): """ mapping taxids to names (or names to names if taxid is not used) @@ -22,7 +22,7 @@ def generate_dict_taxname(tax, taxids): return id_name -def generate_cds_plot_references(table, tax, references): +def cds_plot_references(table, tax, references): # Stacked list of references, accounting for lineage matches # index -> observations (repeated) # columns -> "rank", "ref", "direct", "parent" @@ -43,7 +43,7 @@ def generate_cds_plot_references(table, tax, references): return ColumnDataSource(df_references) -def generate_cds_annotations(table, references, controls, decontam, control_samples): +def cds_annotations(table, references, controls, decontam, control_samples): # Stacked matrix of true annotations (omit false) # index -> taxids # columns -> rank, annot @@ -102,7 +102,7 @@ def generate_cds_annotations(table, references, controls, decontam, control_samp return ColumnDataSource(df_annotations) -def generate_cds_obstable(table, tax, references, controls, control_samples, decontam): +def cds_obstable(table, tax, references, controls, control_samples, decontam): # index unique taxids # col|... values to plot to columns in the datatable # tax|... auxiliary lineage of taxa entries @@ -160,7 +160,7 @@ def generate_cds_obstable(table, tax, references, controls, control_samples, dec return ColumnDataSource(df_obstable) -def generate_cds_sampletable(table): +def cds_sampletable(table): # index unique sample-ids # col|... 
values to plot to columns in the datatable @@ -181,7 +181,7 @@ def generate_cds_sampletable(table): return ColumnDataSource(df_sampletable) -def generate_cds_samplebars(table): +def cds_samplebars(table): # index unique sample-ids # aux| auxiliary values (not plotted) # bar| values plotted as bars (sample counts) @@ -203,7 +203,7 @@ def generate_cds_samplebars(table): return ColumnDataSource(df_bars) -def generate_cds_samples(table, references, controls, decontam): +def cds_samples(table, references, controls, decontam): # index unique sample-ids # aux| auxiliary values (not plotted) # cnt| count values to be copied/traansformed to bars @@ -247,7 +247,7 @@ def generate_cds_samples(table, references, controls, decontam): return ColumnDataSource(df_samples) -def generate_cds_metadata(metadata): +def cds_metadata(metadata): # index -> sample-ids # columns -> metadata fields # values -> metadata values @@ -256,7 +256,7 @@ def generate_cds_metadata(metadata): return ColumnDataSource(df_md) -def generate_cds_plot_metadata(metadata, max_metadata_cols): +def cds_plot_metadata(metadata, max_metadata_cols): # index (unique sample-ids) # md0, md1, ..., md(max_metadata_cols) # values (metadata field, metadata values) @@ -272,7 +272,7 @@ def generate_cds_plot_metadata(metadata, max_metadata_cols): return ColumnDataSource(df_plot_md) -def generate_cds_plot_decontam(decontam): +def cds_plot_decontam(decontam): # index unique sample-ids # concentrations from decontam inputs # controls from decontam inputs @@ -284,7 +284,7 @@ def generate_cds_plot_decontam(decontam): return ColumnDataSource(df_decontam) -def generate_cds_decontam(decontam, ranks): +def cds_decontam(decontam, ranks): """ cds based on a dict with valid values to plot model lines {taxid: (contam_y1, contam_y2, non_contam_y, pval)} @@ -300,7 +300,7 @@ def generate_cds_decontam(decontam, ranks): return ColumnDataSource(dict_coord_mod) -def generate_cds_plot_decontam_models(decontam): +def cds_plot_decontam_models(decontam): """ cds based on a dict with 3 pairs of values to plot. 
x is shared among y_cont and y_noncont # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} @@ -314,7 +314,7 @@ def generate_cds_plot_decontam_models(decontam): return ColumnDataSource(dict_decontam_models) -def generate_dict_sampleobs(table): +def dict_sampleobs(table): # dict with raw counts (not storing zeros) # dict_sampleobs[rank][obs][sample] = count dict_sampleobs = {} @@ -330,7 +330,7 @@ def generate_dict_sampleobs(table): return dict_sampleobs -def generate_cds_heatmap(table, transformation, show_zeros): +def cds_heatmap(table, transformation, show_zeros): # Stacked matrix of raw counts + transformed value # index -> sample-ids (repeated) # obs @@ -362,7 +362,7 @@ def generate_cds_heatmap(table, transformation, show_zeros): return ColumnDataSource(df_heatmap) -def generate_dict_hcluster(table, hcluster): +def dict_hcluster(table, hcluster): # keys -> combination of hclusters # values -> sorted sample-ids @@ -387,7 +387,7 @@ def generate_dict_hcluster(table, hcluster): return leaves_x, leaves_y -def generate_cds_plot_dendro(): +def cds_plot_dendro(): # Empty CDS {"x": [], "y": [], "c": []} dendro_x = {"x": [], "y": [], "c": []} dendro_y = {"x": [], "y": [], "c": []} @@ -396,7 +396,7 @@ def generate_cds_plot_dendro(): return ColumnDataSource(dendro_x), ColumnDataSource(dendro_y) -def generate_dict_dendro(table, dendro): +def dict_dendro(table, dendro): # dict_d_dedro_x and dict_d_dedro_y: # key -> key + "|x" , key + "|y" , key + "|c" # value -> list of lists (x and y) or list (c) @@ -419,7 +419,7 @@ def generate_dict_dendro(table, dendro): return dict_d_dedro_x, dict_d_dedro_y -def generate_dict_topobs(table, top_obs_bars): +def dict_topobs(table, top_obs_bars): dict_top_taxa = {} for rank in table.ranks(): dict_top_taxa[rank] = table.get_top(rank, top_obs_bars) @@ -427,7 +427,7 @@ def generate_dict_topobs(table, top_obs_bars): return dict_top_taxa -def generate_dict_refs(table, references): +def dict_refs(table, references): # dict with information about sources and references # references can be repeated among descriptions, sources and taxids # {taxid: {source: {desc: [refs]}} @@ -455,7 +455,7 @@ def generate_dict_refs(table, references): return d_refs -def generate_cds_correlation(table, corr): +def cds_correlation(table, corr): df_corr = pd.DataFrame(columns=["taxid", "rank", "rho"]) for rank in table.ranks(): stacked_rank_df = pd.DataFrame(corr[rank]["rho"], index=corr[rank]["observations"], columns=corr[rank]["observations"]).stack(dropna=False).reset_index(1) @@ -472,7 +472,7 @@ def generate_cds_correlation(table, corr): return ColumnDataSource(df_corr) -def generate_cds_obsbars(table, top_obs_bars): +def cds_obsbars(table, top_obs_bars): # index (unique sample-ids) # cols: 1, 2, ..., top_obs_bars, unassigned, others, factors @@ -495,7 +495,7 @@ def generate_cds_obsbars(table, top_obs_bars): return ColumnDataSource(df_obsbars) -def generate_cds_mgnify(mgnify, table, tax): +def cds_mgnify(mgnify, table, tax): # index (taxa, level, lineage) # count for each combination of index diff --git a/grimer/config.py b/grimer/config.py index aafa64b..c9114ed 100644 --- a/grimer/config.py +++ b/grimer/config.py @@ -18,7 +18,7 @@ def __new__(self, argv=None): parser.add_argument('-c', '--config', type=str, help="Configuration file") parser.add_argument('-m', '--metadata-file', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. 
QIIME 2 metadata format is also accepted, with categorical and numerical fields.") parser.add_argument('-t', '--tax', type=str, default=None, help="Define taxonomy to use. By default, do not use any taxonomy.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"]) - parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=None, help="Taxonomy files. If not provided, will automatically be downloaded.") + parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=[], help="Taxonomy files. If not provided, will automatically be downloaded.") parser.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000") parser.add_argument('-r', '--ranks', nargs="*", default=[Config.default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + Config.default_rank_name + "' to use entries from the table directly. Default: " + Config.default_rank_name) parser.add_argument('-l', '--title', type=str, default="", help="Title to display on the header of the report.") diff --git a/grimer/func.py b/grimer/func.py index 5aeea19..4d3ce11 100644 --- a/grimer/func.py +++ b/grimer/func.py @@ -31,35 +31,49 @@ def parse_config_file(config): - """ - parse yaml configuration file - """ cfg = None if config: - with open(config, 'r') as file: - cfg = yaml.safe_load(file) + try: + with open(config, 'r') as file: + cfg = yaml.safe_load(file) + except Exception as e: + print_log("Failed loading configuration file [" + config + "], skipping") + print_log(str(e)) + else: + print_log("Not provided, skipping") return cfg def parse_taxonomy(taxonomy, tax_files): tax = None - if taxonomy == "ncbi": - tax = NcbiTx(files=tax_files, extended_names=True) - elif taxonomy == "gtdb": - tax = GtdbTx(files=tax_files) - elif taxonomy == "silva": - tax = SilvaTx(files=tax_files) - elif taxonomy == "greengenes": - tax = GreengenesTx(files=tax_files) - elif taxonomy == "ott": - tax = OttTx(files=tax_files, extended_names=True) + if taxonomy is not None: + try: + if not tax_files: + print_log("Downloading taxonomy") + if taxonomy == "ncbi": + tax = NcbiTx(files=tax_files, extended_names=True) + elif taxonomy == "gtdb": + tax = GtdbTx(files=tax_files) + elif taxonomy == "silva": + tax = SilvaTx(files=tax_files) + elif taxonomy == "greengenes": + tax = GreengenesTx(files=tax_files) + elif taxonomy == "ott": + tax = OttTx(files=tax_files, extended_names=True) + else: + raise + except Exception as e: + print_log("Failed loading " + taxonomy + " taxonomy, skipping") + print_log(str(e)) + else: + print_log("Not provided, skipping") return tax def parse_table(args, tax): table = None - # Specific params if biom file is provided + # Specific default params if biom file is provided if args.input_file.endswith(".biom"): if not args.level_separator: args.level_separator = ";" @@ -128,7 +142,6 @@ def parse_table(args, tax): def parse_metadata(args, table): metadata = None - if args.metadata_file: metadata = Metadata(metadata_file=args.metadata_file, samples=table.samples.to_list()) elif args.input_file.endswith(".biom"): @@ -137,44 +150,47 @@ def parse_metadata(args, table): if biom_in.metadata() is not None: metadata = Metadata(metadata_table=biom_in.metadata_to_dataframe(axis="sample"), samples=table.samples.to_list()) except: - metadata = None - print_log("Error parsing metadata from BIOM file") + print_log("Error parsing metadata from BIOM file, skipping") + return None - if metadata is 
None or metadata.data.empty:
-        metadata = None
-        print_log("No valid metadata")
-    else:
-        print_log("Samples: " + str(metadata.data.shape[0]))
-        print_log("Numeric Fields: " + str(metadata.get_data("numeric").shape[1]))
-        print_log("Categorical Fields: " + str(metadata.get_data("categorical").shape[1]))
-        if len(metadata.get_col_headers()) < args.metadata_cols:
-            args.metadata_cols = len(metadata.get_col_headers())
-    print_log("")
+    if metadata.data.empty:
+        print_log("No valid metadata, skipping")
+        return None
+
+    print_log("Samples: " + str(metadata.data.shape[0]))
+    print_log("Numeric Fields: " + str(metadata.get_data("numeric").shape[1]))
+    print_log("Categorical Fields: " + str(metadata.get_data("categorical").shape[1]))
+    if len(metadata.get_col_headers()) < args.metadata_cols:
+        args.metadata_cols = len(metadata.get_col_headers())
 
     return metadata
 
 
 def parse_references(cfg, tax, taxonomy, ranks):
     references = None
-    if "references" in cfg and taxonomy == "ncbi":
-        references = {}
-        for desc, sf in cfg["references"].items():
-            references[desc] = Reference(file=sf)
-            if tax:
-                # Update taxids / get taxid from name
-                references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax))
-                for i in list(references[desc].ids.keys()):
-                    # lineage of all parent nodes (without itself)
-                    for l in tax.lineage(i)[:-1]:
-                        references[desc].add_parent(l, i)
+    if cfg is not None and "references" in cfg:
+        if taxonomy == "ncbi":
+            references = {}
+            for desc, sf in cfg["references"].items():
+                references[desc] = Reference(file=sf)
+                if tax:
+                    # Update taxids / get taxid from name
+                    references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax))
+                    for i in list(references[desc].ids.keys()):
+                        # lineage of all parent nodes (without itself)
+                        for l in tax.lineage(i)[:-1]:
+                            references[desc].add_parent(l, i)
+        else:
+            print_log("References are only supported with ncbi taxonomy, skipping")
+    else:
+        print_log("No references defined in the configuration file, skipping")
     return references
 
 
 def parse_controls(cfg, table):
     controls = None
     control_samples = None
-
-    if "controls" in cfg:
+    if cfg is not None and "controls" in cfg:
         controls = {}
         control_samples = {}
         for desc, cf in cfg["controls"].items():
@@ -191,6 +207,8 @@ def parse_controls(cfg, table):
             # Add control observations as a reference
             controls[desc] = Reference(ids=obs)
             control_samples[desc] = list(valid_samples)
+    else:
+        print_log("No controls defined in the configuration file, skipping")
 
     return controls, control_samples
 
@@ -198,16 +216,18 @@ def parse_mgnify(run_mgnify, cfg, tax, ranks):
     mgnify = None
     if run_mgnify:
-        if cfg and "mgnify" in cfg["external"]:
-            print_log("- Parsing MGNify")
-            mgnify = MGnify(cfg["external"]["mgnify"], ranks=ranks)
+        if cfg is not None and "mgnify" in cfg["external"]:
+            try:
+                mgnify = MGnify(cfg["external"]["mgnify"], ranks=ranks)
+            except Exception as e:
+                print_log("Failed parsing MGnify database file [" + cfg["external"]["mgnify"] + "], skipping")
+                print_log(str(e))
+                return None
             if tax:
                 mgnify.update_taxids(update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax))
-            print_log("")
         else:
-            print("Configuration file not found.
Skipping MGnify") - print_log("") - + print_log("Not defined in the configuration file, skipping") + else: + print_log("Not activated, skipping") return mgnify @@ -259,7 +279,7 @@ def parse_input_file(input_file, unassigned_header, transpose, sample_replace): # Replace text on sample labels if sample_replace: - print_log("Replacing sample label values:") + print_log("Replacing sample values:") before_replace = table_df.head(1).index #get index as series to use replace method new_index = table_df.reset_index()["index"].replace(regex=dict(zip(sample_replace[::2], sample_replace[1::2]))) @@ -290,8 +310,7 @@ def parse_input_file(input_file, unassigned_header, transpose, sample_replace): if unassigned.sum() == 0: print_log("No unassigned entries defined") - print_log("") - print_log("- Trimming table") + print_log("Trimming table") table_df = trim_table(table_df) # Filter based on the final table @@ -367,7 +386,7 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): # For every pair of replace arguments if obs_replace: - print_log("Replacing values:") + print_log("Replacing observation values:") before_replace = ranks_df.dropna().head(1).values[0] ranks_df.replace(regex=dict(zip(obs_replace[::2], obs_replace[1::2])), inplace=True) for b, a in zip(before_replace, ranks_df.dropna().head(1).values[0]): @@ -388,7 +407,7 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): ranks_df.rename(columns=parsed_ranks, inplace=True) # Update taxids - if tax: + if tax is not None: unmatched_nodes = 0 for i, r in parsed_ranks.items(): rank_nodes = ranks_df[r].dropna().unique() @@ -437,11 +456,12 @@ def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): def parse_single_table(table_df, ranks, tax, default_rank_name): # Update taxids - if tax: + if tax is not None: updated_nodes = update_tax_nodes(table_df.columns, tax) unmatched_nodes = list(updated_nodes.values()).count(tax.undefined_node) if unmatched_nodes: print_log(str(unmatched_nodes) + " observations not found in taxonomy") + for node, upd_node in updated_nodes.items(): if upd_node is not None and upd_node != node: # If updated node is a merge on an existing taxid, sum values @@ -532,10 +552,14 @@ def update_tax_nodes(nodes, tax): return updated_nodes -def run_decontam(cfg, table, metadata, control_samples): - decontam = None - if not cfg: - print("Configuration file not found. Skipping DECONTAM") +def run_decontam(run_decontam, cfg, table, metadata, control_samples): + + if not run_decontam: + print_log("Not activated, skipping") + return None + + if cfg is None: + print_log("Not defined in the configuration file, skipping") return None df_decontam = pd.DataFrame(index=table.samples, columns=["concentration", "controls"]) @@ -555,24 +579,24 @@ def run_decontam(cfg, table, metadata, control_samples): df_decontam["concentration"] = pd.read_table(cfg_decontam["frequency_file"], sep='\t', header=None, skiprows=0, index_col=0).reindex(table.samples) # If any entry is unknown, input is incomplete if df_decontam["concentration"].isnull().values.any(): - print_log("File " + cfg_decontam["frequency_file"] + " is incomplete (Missing: " + ",".join(df_decontam[df_decontam.isnull().any(axis=1)].index.to_list()) + ") Skipping DECONTAM.") + print_log("File " + cfg_decontam["frequency_file"] + " is incomplete (Missing: " + ",".join(df_decontam[df_decontam.isnull().any(axis=1)].index.to_list()) + "), skipping") return None else: - print_log("File " + cfg_decontam["frequency_file"] + " not found. 
Skipping DECONTAM.") + print_log("File " + cfg_decontam["frequency_file"] + " not found, skipping") return None elif "frequency_metadata" in cfg_decontam: if cfg_decontam["frequency_metadata"] in metadata.get_col_headers(): # Get concentrations from metadata df_decontam["concentration"] = metadata.get_col(cfg_decontam["frequency_metadata"]) else: - print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata. Skipping DECONTAM.") + print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata, skipping.") return None elif not table.normalized: # Use total from table print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)") df_decontam["concentration"] = table.total else: - print_log("Cannot run DECONTAM without concentration and normalized values") + print_log("Cannot run DECONTAM without defined concentration and normalized input values, skipping") return None # Print concentrations to file df_decontam["concentration"].to_csv(out_concentration, sep="\t", header=False, index=True) @@ -606,14 +630,12 @@ def run_decontam(cfg, table, metadata, control_samples): print("\n".join(df_decontam.index[df_decontam["controls"]]), file=outf) outf.close() else: - print("Could not find valid control entries. Skipping DECONTAM") + print("Could not find valid control entries, skipping") return None decontam = Decontam(df_decontam) - # Run DECONTAM for each for each for rank in table.ranks(): - if len(table.observations(rank)) == 1: decontam.add_rank_empty(rank, table.observations(rank)) else: @@ -638,6 +660,7 @@ def run_decontam(cfg, table, metadata, control_samples): for file in [out_table, out_concentration, out_controls, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv"]: if os.path.isfile(file): os.remove(file) + return decontam @@ -716,6 +739,17 @@ def dendro_lines_color(dendro, axis): return icoord, dcoord, colors +def pairwise_vlr(mat): + cov = np.cov(mat.T, ddof=1) + diagonal = np.diagonal(cov) + return -2 * cov + diagonal[:, np.newaxis] + diagonal + + +def pairwise_rho(mat): + variances = np.var(mat, axis=0, ddof=1) + return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) + + def include_scripts(scripts): # Insert global js functions and css and return template template = "{% block postamble %}" @@ -727,16 +761,6 @@ def include_scripts(scripts): template += "{% endblock %}" return template -def pairwise_vlr(mat): - cov = np.cov(mat.T, ddof=1) - diagonal = np.diagonal(cov) - return -2 * cov + diagonal[:, np.newaxis] + diagonal - - -def pairwise_rho(mat): - variances = np.var(mat, axis=0, ddof=1) - return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) - def format_js_toString(val): # Transform numeric value to float and string to match toString diff --git a/grimer/grimer.py b/grimer/grimer.py index ce3f4a6..edac13d 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -2,7 +2,6 @@ _debug = False #General -import argparse import sys #Internal @@ -13,7 +12,6 @@ from grimer.plots import * from grimer.func import * - #Bokeh from bokeh.io import save from bokeh.plotting import output_file @@ -21,15 +19,15 @@ def main(argv=sys.argv[1:]): """ - GRIMER steps + GRIMER code overview 1) Load data/analysis: parse configuration, load files and run analysis into data objects e.g. args.input_file to Table() and decontam 2) Generata data sources: Convert objects and analysis int cds/dict e.g. 
 
 def include_scripts(scripts):
     # Insert global js functions and css and return template
     template = "{% block postamble %}"
@@ -727,16 +761,6 @@ def include_scripts(scripts):
     template += "{% endblock %}"
     return template
 
-def pairwise_vlr(mat):
-    cov = np.cov(mat.T, ddof=1)
-    diagonal = np.diagonal(cov)
-    return -2 * cov + diagonal[:, np.newaxis] + diagonal
-
-
-def pairwise_rho(mat):
-    variances = np.var(mat, axis=0, ddof=1)
-    return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances))
-
 
 def format_js_toString(val):
     # Transform numeric value to float and string to match toString
diff --git a/grimer/grimer.py b/grimer/grimer.py
index ce3f4a6..edac13d 100755
--- a/grimer/grimer.py
+++ b/grimer/grimer.py
@@ -2,7 +2,6 @@
 _debug = False
 
 #General
-import argparse
 import sys
 
 #Internal
@@ -13,7 +12,6 @@
 from grimer.plots import *
 from grimer.func import *
 
-
 #Bokeh
 from bokeh.io import save
 from bokeh.plotting import output_file
@@ -21,15 +19,15 @@
 
 def main(argv=sys.argv[1:]):
     """
-    GRIMER steps
+    GRIMER code overview
    1) Load data/analysis: parse configuration,
    load files and run analysis into data objects
    e.g. args.input_file to Table() and decontam
    2) Generate data sources: Convert objects and analysis into cds/dict
    e.g. table to cds_m_obstable
-    3) Plot figures and elements based on cds/dict (and some objects)
+    3) Plot elements: plot figures and widgets based on cds/dict (and some objects)
    e.g. cds_m_obstable to ele["obstable"]["fig"]
-    4) Link javascript callbacks between elements and cds/dict
-    5) Put elements into layout and generate report
+    4) Link javascript: link data sources and javascript custom callbacks
+    5) Draw layout: Put elements into layout scheme and generate report
     """
 
     # Parse CLI arguments
@@ -73,7 +71,7 @@ def main(argv=sys.argv[1:]):
     mgnify = parse_mgnify(args.mgnify, cfg, tax, table.ranks())
 
     print_log("- Running DECONTAM")
-    decontam = run_decontam(cfg, table, metadata, control_samples)
+    decontam = run_decontam(args.decontam, cfg, table, metadata, control_samples)
 
     print_log("- Running hierarchical clustering")
     hcluster, dendro = run_hclustering(table, args.linkage_methods, args.linkage_metrics, args.transformation, args.skip_dendrogram, args.optimal_ordering)
@@ -91,64 +89,60 @@ def main(argv=sys.argv[1:]):
     # _m_ : mixed -> contain both plot and data properties
     print_log("- Generating data sources")
-
     # _m_
     # df: index (unique observations), col|..., tax|..., aux|ref
-    cds_m_obstable = generate_cds_obstable(table, tax, references, controls, control_samples, decontam)
-
+    cds_m_obstable = cds_obstable(table, tax, references, controls, control_samples, decontam)
     # _p_
     # df: index (unique sample-ids), aux|..., bar|..., tax|...
-    cds_p_samplebars = generate_cds_samplebars(table)
+    cds_p_samplebars = cds_samplebars(table)
     # stacked: index (repeated observations), rank, ref, direct, parent
-    cds_p_references = generate_cds_plot_references(table, tax, references)
+    cds_p_references = cds_plot_references(table, tax, references)
     # matrix: index (unique sample-ids), concentrations, controls, counts
-    cds_p_decontam = generate_cds_plot_decontam(decontam) if decontam else None
+    cds_p_decontam = cds_plot_decontam(decontam) if decontam else None
     # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]}
-    cds_p_decontam_models = generate_cds_plot_decontam_models(decontam) if decontam else None
+    cds_p_decontam_models = cds_plot_decontam_models(decontam) if decontam else None
     # stacked: index (taxa, level, lineage), count, perc
-    cds_p_mgnify = generate_cds_mgnify(mgnify, table, tax) if mgnify else None
+    cds_p_mgnify = cds_mgnify(mgnify, table, tax) if mgnify else None
     # stacked: index (repeated sample-ids), obs, rank, ov, tv
-    cds_p_heatmap = generate_cds_heatmap(table, args.transformation, args.show_zeros)
+    cds_p_heatmap = cds_heatmap(table, args.transformation, args.show_zeros)
     # matrix: index (unique sample-ids), md0, md1, ..., md(args.metadata_cols) -> (metadata field, metadata values)
-    cds_p_metadata = generate_cds_plot_metadata(metadata, args.metadata_cols) if metadata else None
+    cds_p_metadata = cds_plot_metadata(metadata, args.metadata_cols) if metadata else None
     # stacked: index (repeated observations), rank, annot
-    cds_p_annotations = generate_cds_annotations(table, references, controls, decontam, control_samples)
+    cds_p_annotations = cds_annotations(table, references, controls, decontam, control_samples)
     # empty matrix {"x": [], "y": [], "c": []}
-    cds_p_dendro_x, cds_p_dendro_y = generate_cds_plot_dendro() if not args.skip_dendrogram else [None, None]
+    cds_p_dendro_x, cds_p_dendro_y = cds_plot_dendro() if not args.skip_dendrogram else [None, None]
     # stacked: index (repeated observations), other observation, rank, rho
-    cds_p_correlation = generate_cds_correlation(table,
corr) + cds_p_correlation = cds_correlation(table, corr) # matrix: index (unique sample-ids), 0, 1, ..., top_obs_bars, unassigned, others, factors - cds_p_obsbars = generate_cds_obsbars(table, args.top_obs_bars) + cds_p_obsbars = cds_obsbars(table, args.top_obs_bars) # df: index (unique sample-ids), col|... - cds_p_sampletable = generate_cds_sampletable(table) - + cds_p_sampletable = cds_sampletable(table) # _d_ # df: index (unique sample-ids), aux|..., cnt|..., - cds_d_samples = generate_cds_samples(table, references, controls, decontam) + cds_d_samples = cds_samples(table, references, controls, decontam) # matrix: index (unique sample-ids) x columns (metadata fields) -> metadata values - cds_d_metadata = generate_cds_metadata(metadata) if metadata else None + cds_d_metadata = cds_metadata(metadata) if metadata else None # {taxid: (contam_y1, contam_y2, non_contam_y, pval)} - cds_d_decontam = generate_cds_decontam(decontam, table.ranks()) if decontam else None + cds_d_decontam = cds_decontam(decontam, table.ranks()) if decontam else None # key = rank + "|" + method + "|" + metric # y: {"default": sorted sample-ids, key: sorted sample-ids, ...} # x: {"default|rank": sorted sample-ids, key: sorted sample-ids, ...} - dict_d_hcluster_x, dict_d_hcluster_y = generate_dict_hcluster(table, hcluster) + dict_d_hcluster_x, dict_d_hcluster_y = dict_hcluster(table, hcluster) # {key+"|x": x-values, key+"|y": y-values , key+"|c": colors} - dict_d_dedro_x, dict_d_dedro_y = generate_dict_dendro(table, dendro) if not args.skip_dendrogram else [None, None] + dict_d_dedro_x, dict_d_dedro_y = dict_dendro(table, dendro) if not args.skip_dendrogram else [None, None] # {taxid: name} - dict_d_taxname = generate_dict_taxname(tax, [txid for rank in table.ranks() for txid in table.observations(rank)]) + dict_d_taxname = dict_taxname(tax, [txid for rank in table.ranks() for txid in table.observations(rank)]) # {rank: [taxid1,taxid2, ..., taxid(top_obs_bars)]} - dict_d_topobs = generate_dict_topobs(table, args.top_obs_bars) + dict_d_topobs = dict_topobs(table, args.top_obs_bars) # {taxid: {source: {desc: [refs]}} - dict_d_refs = generate_dict_refs(table, references) + dict_d_refs = dict_refs(table, references) # dict: {rank: {obs: {sample: count}}} - dict_d_sampleobs = generate_dict_sampleobs(table) + dict_d_sampleobs = dict_sampleobs(table) - ############ PLOT ELEMENTS (Figures, Widgets, ...) 
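Aside: several of the _p_ data sources generated above are "stacked" long-format views of the wide sample-by-observation matrix (note the "stacked:" comments). A small pandas sketch of that reshape, with purely illustrative data:

    import pandas as pd

    # Wide table: samples as rows, observations as columns, like table.data[rank]
    wide = pd.DataFrame({"taxA": [5, 0], "taxB": [2, 7]},
                        index=["sample1", "sample2"])

    # Long/"stacked" form: one row per (sample, observation) pair, the layout
    # the Bokeh glyphs consume (the index holds repeated sample-ids)
    stacked = wide.stack().reset_index(1)
    stacked.columns = ["obs", "ov"]   # observation id and original value
    stacked["rank"] = "genus"         # repeated rank label, as in cds_p_heatmap
    print(stacked)
    #           obs  ov   rank
    # sample1  taxA   5  genus
    # sample1  taxB   2  genus
    # sample2  taxA   0  genus
    # sample2  taxB   7  genus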
- ############ "fig": main figure - ############ "wid": widgets + # 3) Plot elements + print_log("- Plotting elements") - # Layout and plot sizes + # Defined fixed layout and plot sizes sizes = {} sizes["overview_top_panel_height"] = 300 sizes["overview_top_panel_width_left"] = 250 @@ -247,10 +241,10 @@ def main(argv=sys.argv[1:]): ele["obsbars"]["wid"] = plot_obsbars_widgets(table.ranks(), metadata, dict_d_topobs, dict_d_taxname, args.top_obs_bars) ele["obsbars"]["fig"], ele["obsbars"]["legend"] = plot_obsbars(cds_p_obsbars, dict_d_topobs, table.ranks(), args.top_obs_bars, dict_d_taxname, ele["obsbars"]["wid"]["rank_select"]) - ############ JAVASCRIPT LINKING + #4) Link javascript: + print_log("- Linking javascript") link_obstable_filter(ele, cds_m_obstable, table.ranks()) - link_obstable_samplebars(ele, cds_m_obstable, cds_p_samplebars, @@ -267,7 +261,6 @@ def main(argv=sys.argv[1:]): cds_p_mgnify, dict_d_refs, dict_d_taxname) - link_heatmap_widgets(ele, cds_d_samples, cds_d_metadata, @@ -283,11 +276,8 @@ def main(argv=sys.argv[1:]): cds_p_heatmap, table.ranks(), dict_d_taxname) - link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, args.metadata_cols) - link_correlation_widgets(ele, cds_p_correlation) - link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, @@ -297,11 +287,10 @@ def main(argv=sys.argv[1:]): dict_d_taxname, cds_d_metadata, cds_p_sampletable) - link_sampletable_select(ele, cds_p_sampletable, cds_d_metadata) - ############ LAYOUT - + # 5) Draw layout + print_log("- Drawing layout") # Define path of running script to get static files script_dir, _ = os.path.split(__file__) logo_path = os.path.join(script_dir, "img", "logo.png") @@ -320,6 +309,7 @@ def main(argv=sys.argv[1:]): mode = "cdn" # configure to load Bokeh JS and CSS from https://cdn.bokeh.org # setup output file and JS mode + print_log("- Saving report") output_file(args.output_html, title="GRIMER" if not args.title else "GRIMER - " + args.title, mode=mode) save(final_layout, template=template) print_log("File: " + args.output_html) diff --git a/grimer/utils.py b/grimer/utils.py deleted file mode 100644 index 6507a45..0000000 --- a/grimer/utils.py +++ /dev/null @@ -1,654 +0,0 @@ -#General -import numpy as np -import os -import sys -import subprocess -import shlex -import pandas as pd - -#Internal -from grimer.decontam import Decontam -from grimer.reference import Reference - -# Bokeh -from bokeh.palettes import Category10, Category20, Colorblind, linear_palette, Turbo256 - -#biom -import biom - -# scikit-bio -from skbio.stats.composition import clr - -# Scipy -import scipy.cluster.hierarchy as sch - - -def parse_input_table(input_file, unassigned_header, transpose, sample_replace): - - if input_file.endswith(".biom"): - table_df = biom.load_table(input_file).to_dataframe(dense=True) - else: - # Default input_file: index=observations, columns=samples - # table_df should have samples on indices and observations on columns - table_df = pd.read_table(input_file, sep='\t', index_col=0).transpose().fillna(0) - - # If user is providing a reverse table, turn back - if transpose: - table_df = table_df.transpose() - - # Remove header on rows - table_df.index.name = None - - # Replace text on sample labels - if sample_replace: - print_log("Replacing sample label values:") - before_replace = table_df.head(1).index - #get index as series to use replace method - new_index = table_df.reset_index()["index"].replace(regex=dict(zip(sample_replace[::2], sample_replace[1::2]))) - table_df.set_index(new_index, inplace=True) - 
for b, a in zip(before_replace, table_df.head(1).index): - print_log(" " + b + " -> " + a) - print_log(" ...") - - # Sum total before split unassigned or filter - total = table_df.sum(axis=1) - - # unique unassigned/unclassified for table - # Separate unassigned counts column from main data frame - unassigned = pd.Series(0, index=table_df.index) - if unassigned_header: - for header in unassigned_header: - if header in table_df.columns: - if isinstance(table_df[header], pd.DataFrame): - # Sum in case there are several equally named headers - unassigned += table_df[header].sum(axis=1) - else: - # return a pd.Series - unassigned += table_df[header] - table_df.drop(columns=header, inplace=True) - else: - print_log("'" + header + "' header not found") - - if unassigned.sum() == 0: - print_log("No unassigned entries defined") - - print_log("") - print_log("- Trimming table") - table_df = trim_table(table_df) - - # Filter based on the final table - unassigned = unassigned.reindex(table_df.index) - total = total.reindex(table_df.index) - - return table_df, total, unassigned - - -def filter_input_table(table_df, total, min_frequency, max_frequency, min_count, max_count, normalized): - - if min_count: - cnt = table_df.sum().sum() - if min_count < 1: - table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df - table_df = table_df[table_df_norm >= min_count].fillna(0) - elif min_count > 1: - table_df = table_df[table_df >= min_count].fillna(0) - print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --min-count " + str(min_count)) - - if max_count: - cnt = table_df.sum().sum() - if max_count < 1: - table_df_norm = transform_table(table_df, total, "norm", 0) if not normalized else table_df - table_df = table_df[table_df_norm <= max_count].fillna(0) - elif max_count > 1: - table_df = table_df[table_df <= max_count].fillna(0) - print_log(str(int(cnt - table_df.sum().sum())) + " counts skipped with --max-count " + str(max_count)) - - if min_frequency: - cnt = table_df.shape[1] - table_df_freq = table_df.gt(0).sum(axis=0) - if min_frequency < 1: - table_df_freq = table_df_freq / table_df.shape[0] - table_df = table_df.loc[:, table_df_freq >= min_frequency] - elif min_frequency > 1: - table_df = table_df.loc[:, table_df_freq >= min_frequency] - print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --min-frequency " + str(min_frequency)) - - if max_frequency: - cnt = table_df.shape[1] - table_df_freq = table_df.gt(0).sum(axis=0) - if max_frequency < 1: - table_df_freq = table_df_freq / table_df.shape[0] - table_df = table_df.loc[:, table_df_freq <= max_frequency] - elif max_frequency > 1: - table_df = table_df.loc[:, table_df_freq <= max_frequency] - print_log(str(int(cnt - table_df.shape[1])) + " observations removed with --max-frequency " + str(max_frequency)) - - return table_df - - -def trim_table(table_df): - # Check for cols/rows with sum zero - zero_rows = table_df.sum(axis=1).eq(0) - if any(zero_rows): - table_df = table_df.loc[~zero_rows, :] - print_log(str(sum(zero_rows)) + " samples with only zero removed") - - zero_cols = table_df.sum(axis=0).eq(0) - if any(zero_cols): - table_df = table_df.loc[:, ~zero_cols] - print_log(str(sum(zero_cols)) + " observations with only zero removed") - - return table_df - - -def parse_multi_table(table_df, ranks, tax, level_separator, obs_replace): - from grimer.grimer import _debug - - # Transpose table (obseravations as index) and expand ranks in columns - ranks_df = 
table_df.T.index.str.split(level_separator, expand=True).to_frame(index=False) - - # For every pair of replace arguments - if obs_replace: - print_log("Replacing values:") - before_replace = ranks_df.dropna().head(1).values[0] - ranks_df.replace(regex=dict(zip(obs_replace[::2], obs_replace[1::2])), inplace=True) - for b, a in zip(before_replace, ranks_df.dropna().head(1).values[0]): - print_log(" " + b + " -> " + a) - print_log(" ...") - - # replace entirely space or empty with NaN - ranks_df = ranks_df.replace(r'^\s*$', np.nan, regex=True) - - # Set rank names, matching user defined or default - user_ranks = False - if len(ranks) == ranks_df.shape[1]: - parsed_ranks = {r: ranks[r] for r in range(ranks_df.shape[1])} - user_ranks = True - else: - print_log("Ranks provided (" + str(len(ranks)) + ") do not match file (" + str(ranks_df.shape[1]) + " levels). Using default named ranks.") - parsed_ranks = {r: "rank-" + str(r) for r in range(ranks_df.shape[1])} - ranks_df.rename(columns=parsed_ranks, inplace=True) - - # Update taxids - if tax: - unmatched_nodes = 0 - for i, r in parsed_ranks.items(): - rank_nodes = ranks_df[r].dropna().unique() - - # If there is at least one valid entry - if rank_nodes.any(): - # If user-provided ranks are matching, update nodes with rank - if user_ranks: - updated_nodes = {node: unode for (rank, node), unode in update_tax_nodes([(r, n) for n in rank_nodes], tax).items()} - else: - updated_nodes = update_tax_nodes(rank_nodes, tax) - - # Add nan to keep missing ranks (different than tax.undefined_node [None] which will keep the name) - updated_nodes[np.nan] = np.nan - ranks_df[r] = ranks_df[r].map(lambda t: updated_nodes[t] if updated_nodes[t] is not None else t) - del updated_nodes[np.nan] - - unmatched_nodes += list(updated_nodes.values()).count(tax.undefined_node) - - if unmatched_nodes: - print_log(str(unmatched_nodes) + " observations not found in taxonomy (but kept)") - - # Check unique lineage - for i, r in parsed_ranks.items(): - if i > 0: - lin_count = ranks_df.iloc[:, :i+1].drop_duplicates().groupby(r).count() - invalid = lin_count[(lin_count > 1).any(axis=1)].index.to_list() - if invalid: - print_log(str(len(invalid)) + " observations removed with invalid lineage at " + r) - if _debug: - print_log(",".join(invalid) + " observations removed with invalid lineage at " + r) - # Set to NaN to keep shape of ranks_df - ranks_df.loc[ranks_df[r].isin(invalid), r] = np.nan - - ranked_tables = {} - for i, r in parsed_ranks.items(): - # ranks_df and table_df.T have the same shape - ranked_table_df = pd.concat([ranks_df[r], table_df.T.reset_index(drop=True)], axis=1) - ranked_tables[r] = ranked_table_df.groupby([r], dropna=True).sum().T - ranked_tables[r].columns.name = None - - lineage = ranks_df - return ranked_tables, lineage - - -def parse_single_table(table_df, ranks, tax, default_rank_name): - - # Update taxids - if tax: - updated_nodes = update_tax_nodes(table_df.columns, tax) - unmatched_nodes = list(updated_nodes.values()).count(tax.undefined_node) - if unmatched_nodes: - print_log(str(unmatched_nodes) + " observations not found in taxonomy") - for node, upd_node in updated_nodes.items(): - if upd_node is not None and upd_node != node: - # If updated node is a merge on an existing taxid, sum values - if upd_node in table_df: - table_df[upd_node] += table_df[node] - table_df.drop(columns=node, inplace=True) - print_log("Updated and merged taxonomic nodes: " + node + " -> " + upd_node) - else: - table_df.rename(columns={node: upd_node}, inplace=True) - 
print_log("Updated taxonomic node: " + node + " -> " + upd_node) - - # Generate ranks - ranked_tables = {} - for rank in ranks: - # Special case for "default" rank - if rank == default_rank_name: - ranked_tables[rank] = table_df - else: - taxid_parent_rank = {i: tax.parent_rank(tax.latest(i), rank) for i in table_df.columns} - rank_df = pd.DataFrame(index=table_df.index) - for taxid, parent_rank_taxid in taxid_parent_rank.items(): - if parent_rank_taxid is None: - #no_rank += 1 - continue - if parent_rank_taxid not in rank_df: - rank_df[parent_rank_taxid] = 0 - rank_df[parent_rank_taxid] += table_df[taxid] - - if not rank_df.empty: - ranked_tables[rank] = rank_df - - # Generate lineage - if tax: - lineage = pd.DataFrame(list(map(lambda t: tax.lineage(t, ranks=list(ranked_tables.keys())), table_df.columns)), columns=list(ranked_tables.keys())) - else: - lineage = pd.DataFrame() - - return ranked_tables, lineage - - -def transform_table(df, total_counts, transformation, replace_zero_value): - # Special case clr with one observation (result in zeros) - if transformation == "clr" and df.shape[1] == 1: - print_log("WARNING: using log instead of clr with one observation") - transformation = "log" - - if transformation == "log": - transformed_df = (df + replace_zero_value).apply(np.log10) - elif transformation == "clr": - transformed_df = pd.DataFrame(clr(df + replace_zero_value), index=df.index, columns=df.columns) - elif transformation == "norm": - transformed_df = df.divide(total_counts, axis=0) + replace_zero_value - else: - transformed_df = df + replace_zero_value - - return transformed_df - - -def update_tax_nodes(nodes, tax): - """ - nodes can be a list of strings: taxids or names or a list of tuples with (rank, taxid/name) - Return a dictionary mapping nodes and updated nodes (or None) - First look for id, if nothing found, lookup by unique name - """ - - updated_nodes = {} - for node in nodes: - if isinstance(node, tuple): - r = node[0] - n = node[1] - else: - r = None - n = node - - # Either returns same node, updated or tax.undefined_node (None) - updated_taxid = tax.latest(n) - if updated_taxid: - # Assign updated or same taxid - updated_nodes[node] = updated_taxid - else: - names = tax.search_name(n, rank=r, exact=True) - # Assign taxid if found unique name only - if names and len(names) == 1: - updated_nodes[node] = names[0] - else: - updated_nodes[node] = tax.undefined_node - - return updated_nodes - - -def run_decontam(cfg, table, metadata, control_samples, normalized): - - if not cfg: - print("Configuration file not found. 
Skipping DECONTAM") - return None - - df_decontam = pd.DataFrame(index=table.samples, columns=["concentration", "controls"]) - cfg_decontam = cfg["external"]["decontam"] - tmp_output_prefix = "tmp_" - - # Collect metadata for DECONTAM (concentrations to use frequency and control for prevalence) - out_table = tmp_output_prefix + "table_counts.tsv" - out_concentration = tmp_output_prefix + "concentration_counts.tsv" - out_controls = tmp_output_prefix + "control_samples_list.txt" - if cfg_decontam["method"] in ["frequency", "combined"]: - out_concentration = tmp_output_prefix + "concentration_counts.tsv" - # Load frequency file, if provided - if "frequency_file" in cfg_decontam: - if os.path.isfile(cfg_decontam["frequency_file"]): - # Load concentrations from file and sort (reindex) based on table inputs - df_decontam["concentration"] = pd.read_table(cfg_decontam["frequency_file"], sep='\t', header=None, skiprows=0, index_col=0).reindex(table.samples) - # If any entry is unknown, input is incomplete - if df_decontam["concentration"].isnull().values.any(): - print_log("File " + cfg_decontam["frequency_file"] + " is incomplete (Missing: " + ",".join(df_decontam[df_decontam.isnull().any(axis=1)].index.to_list()) + ") Skipping DECONTAM.") - return None - else: - print_log("File " + cfg_decontam["frequency_file"] + " not found. Skipping DECONTAM.") - return None - elif "frequency_metadata" in cfg_decontam: - if cfg_decontam["frequency_metadata"] in metadata.get_col_headers(): - # Get concentrations from metadata - df_decontam["concentration"] = metadata.get_col(cfg_decontam["frequency_metadata"]) - else: - print_log("Could not find " + cfg_decontam["frequency_metadata"] + " in the metadata. Skipping DECONTAM.") - return None - elif not normalized: - # Use total from table - print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)") - df_decontam["concentration"] = table.total - else: - print_log("Cannot run DECONTAM without concentration and normalized values") - return None - # Print concentrations to file - df_decontam["concentration"].to_csv(out_concentration, sep="\t", header=False, index=True) - - if cfg_decontam["method"] in ["prevalence", "combined"]: - control_list = set() - if "prevalence_file" in cfg_decontam: - for file in cfg_decontam["prevalence_file"]: - if os.path.isfile(file): - # Load controls from file - control_list.update([line.rstrip() for line in open(file)]) - else: - print_log("File not found " + file) - elif "prevalence_metadata" in cfg_decontam: - for field, value in cfg_decontam["prevalence_metadata"].items(): - if field in metadata.get_col_headers(): - control_list.update(metadata.get_subset(field, value).index) - else: - print_log("Could not find " + field + " in the metadata.") - else: - # Use all samples passed as controls - for cs in control_samples.values(): - control_list.update(cs) - - # Select valid controls - df_decontam["controls"] = table.samples.isin(control_list) - - if df_decontam["controls"].any(): - print_log(str(df_decontam["controls"].sum()) + " valid control samples to be used by DECONTAM") - outf = open(out_controls, "w") - print("\n".join(df_decontam.index[df_decontam["controls"]]), file=outf) - outf.close() - else: - print("Could not find valid control entries. 
Skipping DECONTAM") - return None - - decontam = Decontam(df_decontam) - - # Run DECONTAM for each for each - for rank in table.ranks(): - - if len(table.observations(rank)) == 1: - decontam.add_rank_empty(rank, table.observations(rank)) - else: - # normalize and write temporary table for each rank - if not normalized: - transform_table(table.data[rank], table.total[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) - else: - table.data[rank].to_csv(out_table, sep="\t", header=True, index=True) - - cmd = " ".join(["scripts/run_decontam.R", - "--resout " + tmp_output_prefix + "decontam_out.tsv", - "--modout " + tmp_output_prefix + "decontam_mod.tsv", - "--counts " + out_table, - "--concentrations " + out_concentration if cfg_decontam["method"] in ["frequency", "combined"] else "", - "--controls " + out_controls if cfg_decontam["method"] in ["prevalence", "combined"] else "", - "--method " + cfg_decontam["method"], - "--threshold " + str(cfg_decontam["threshold"])]) - stdout, stderr = run_cmd(cmd) - - decontam.add_rank_results(rank, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv") - - for file in [out_table, out_concentration, out_controls, tmp_output_prefix + "decontam_out.tsv", tmp_output_prefix + "decontam_mod.tsv"]: - if os.path.isfile(file): - os.remove(file) - return decontam - - -def run_hclustering(table, linkage_methods, linkage_metrics, transformation, replace_zero_value, skip_dendrogram, optimal_ordering): - hcluster = {} - dendro = {} - - for rank in table.ranks(): - - # Get .values of transform, numpy array - matrix = transform_table(table.data[rank], table.total, transformation, replace_zero_value).values - - hcluster[rank] = {} - dendro[rank] = {} - for method in linkage_methods: - hcluster[rank][method] = {} - dendro[rank][method] = {} - for metric in linkage_metrics: - hcluster[rank][method][metric] = {} - hcluster[rank][method][metric]["x"] = {} - hcluster[rank][method][metric]["y"] = {} - - #H.clustering, returning dendrogram - # Only one observation does not cluster - if matrix.shape[1] > 1: - x = sch.dendrogram(sch.linkage(matrix.transpose(), method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True) - hcluster[rank][method][metric]["x"]["index"] = table.observations(rank)[x["leaves"]].to_list() - else: - hcluster[rank][method][metric]["x"]["index"] = table.observations(rank).to_list() - - # Only one samples does not cluster - if matrix.shape[0] > 1: - y = sch.dendrogram(sch.linkage(matrix, method=method, metric=metric, optimal_ordering=optimal_ordering), no_plot=True) - hcluster[rank][method][metric]["y"]["index"] = table.samples[y["leaves"]].to_list() - else: - hcluster[rank][method][metric]["y"]["index"] = table.samples.to_list() - - if not skip_dendrogram: - dendro[rank][method][metric] = {} - dendro[rank][method][metric]["y"] = {} - dendro[rank][method][metric]["x"] = {} - - # Save dendrogram values and colors - xs, ys, colors = [[]] * 3 - if matrix.shape[1] > 1: - xs, ys, colors = dendro_lines_color(x, "x") - dendro[rank][method][metric]["x"]["xs"] = xs - dendro[rank][method][metric]["x"]["ys"] = ys - dendro[rank][method][metric]["x"]["colors"] = colors - if matrix.shape[0] > 1: - xs, ys, colors = dendro_lines_color(y, "y") - dendro[rank][method][metric]["y"]["xs"] = xs - dendro[rank][method][metric]["y"]["ys"] = ys - dendro[rank][method][metric]["y"]["colors"] = colors - - return hcluster, dendro - - -def dendro_lines_color(dendro, axis): - icoord = 
pd.DataFrame(dendro["icoord"]) - icoord = icoord * ((len(dendro["icoord"]) + 0.5) / icoord.max().max()) - icoord = icoord.values.tolist() - if axis == "y": - dcoord = dendro["dcoord"] - else: - dcoord = [[-j for j in i] for i in dendro['dcoord']] - - color_list = dendro["color_list"] - unique_colors = sorted(set(color_list)) - cp = make_color_palette(len(unique_colors)) - colors = [cp[unique_colors.index(colorid)] for colorid in color_list] - - if axis == "y": - return dcoord, icoord, colors - else: - return icoord, dcoord, colors - - -def include_scripts(scripts): - # Insert global js functions and css and return template - template = "{% block postamble %}" - for file, t in scripts.items(): - with open(file, 'r') as file: - template += "<" + t + ">" - template += "".join(file.readlines()) - template += "" - template += "{% endblock %}" - return template - - -def parse_references(cfg, tax, ranks): - references = {} - - for desc, sf in cfg["references"].items(): - references[desc] = Reference(file=sf) - if tax: - # Update taxids / get taxid from name - references[desc].update_taxids(update_tax_nodes(references[desc].ids, tax)) - for i in list(references[desc].ids.keys()): - # lineage of all parent nodes (without itself) - for l in tax.lineage(i)[:-1]: - references[desc].add_parent(l, i) - - return references - - -def parse_controls(cfg, table): - controls = {} - control_samples = {} - - for desc, cf in cfg["controls"].items(): - with open(cf, "r") as file: - samples = file.read().splitlines() - obs = set() - valid_samples = set() - for rank in table.ranks(): - # Retrieve sub-table for every rank - control_table = table.get_subtable(rank, samples=samples) - obs.update(control_table.columns.to_list()) - valid_samples.update(control_table.index.to_list()) - - # Add control observations as a reference - controls[desc] = Reference(ids=obs) - control_samples[desc] = list(valid_samples) - - return controls, control_samples - - -def pairwise_vlr(mat): - cov = np.cov(mat.T, ddof=1) - diagonal = np.diagonal(cov) - return -2 * cov + diagonal[:, np.newaxis] + diagonal - - -def pairwise_rho(mat): - variances = np.var(mat, axis=0, ddof=1) - return 1 - (pairwise_vlr(mat) / np.add.outer(variances, variances)) - - -def format_js_toString(val): - # Transform numeric value to float and string to match toString - return str(float(val)) if isinstance(val, (int, float)) else str(val) - - -def make_color_palette(n_colors, linear: bool=False, palette: dict=None): - if isinstance(palette, dict) and n_colors <= max(palette.keys()): - # Special case for 1 and 2 (not in palettes) - palette = palette[3 if n_colors < 3 else n_colors] - - if linear or n_colors > 20: - if not palette: - palette = Turbo256 - if n_colors <= 256: - return linear_palette(palette, n_colors) - else: - # Repeat colors - return [palette[int(i * 256.0 / n_colors)] for i in range(n_colors)] - else: - # Select color palette based on number of requested colors - # Return the closest palette with most distinc set of colors - if not palette: - if n_colors <= 8: - palette = Colorblind[8] - elif n_colors <= 10: - palette = Category10[10] - elif n_colors <= 20: - palette = Category20[20] - else: - palette = Turbo256 - - return palette[:n_colors] - -def run_cmd(cmd, print_stderr: bool=False, exit_on_error: bool=True): - errcode = 0 - stdout = "" - stderr = "" - try: - process = subprocess.Popen(shlex.split(cmd), - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - # wait for the process to terminate - stdout, stderr = 
process.communicate() - errcode = process.returncode - if exit_on_error and errcode != 0: - raise Exception() - if print_stderr and stderr: - print_log(stderr) - - except Exception as e: - print_log('The following command failed to run:\n' + cmd) - print_log(str(e)) - print_log("Error code: " + str(errcode)) - print_log("Out: ") - if stdout: - print_log(stdout) - print_log("Error: ") - if stderr: - print_log(stderr) - sys.exit(errcode) - - return stdout, stderr - - -def print_log(text): - sys.stderr.write(text + "\n") - sys.stderr.flush() - - -def print_df(df, name: str=None): - from grimer.grimer import _debug - if _debug: - print(name) - if isinstance(df, dict): - if df: - print(len(df.keys()), "keys:", list(df.keys())[0], "...", list(df.keys())[-1]) - #print(list(df.values())[0], "...", list(df.values())[-1]) - else: - #print(df.columns) - print(df.head()) - print(df.shape) - print("size:", sys.getsizeof(df)) - print("-----------------------------------------------") - - -def print_logo_cli(version): - print_log("==================") - print_log(" ╔═╗╦═╗╦╔╦╗╔═╗╦═╗ ") - print_log(" ║ ╦╠╦╝║║║║║╣ ╠╦╝ ") - print_log(" ╚═╝╩╚═╩╩ ╩╚═╝╩╚═ ") - print_log(" v" + version) - print_log("==================") From bd220b2ec0a58bc9a04b9dfc98f49e63edf2f809 Mon Sep 17 00:00:00 2001 From: "Vitor C. Piro" Date: Sat, 5 Mar 2022 12:12:58 +0100 Subject: [PATCH 46/50] output panels, better error parsing table, argparse --- grimer/cds.py | 18 ++--- grimer/config.py | 60 +++++++------- grimer/func.py | 19 ++--- grimer/grimer.py | 12 ++- grimer/layout.py | 205 ++++++++++++++++++++++------------------------- 5 files changed, 156 insertions(+), 158 deletions(-) diff --git a/grimer/cds.py b/grimer/cds.py index 23812cf..af68fa7 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -4,7 +4,7 @@ from math import pi #Internal -from grimer.func import print_df, transform_table, print_log, pairwise_rho, format_js_toString +from grimer.func import print_df, transform_table, print_log, format_js_toString #Bokeh from bokeh.models import ColumnDataSource @@ -173,7 +173,7 @@ def cds_sampletable(table): # assigned by rank for rank in table.ranks(): - df_sampletable["col|" + rank] = table.data[rank].sum(axis=1).divide(table.total, axis=0) + df_sampletable["col|" + rank] = table.data[rank].sum(axis=1).divide(table.get_total(), axis=0) df_sampletable.fillna(0, inplace=True) @@ -211,11 +211,11 @@ def cds_samples(table, references, controls, decontam): df_samples = pd.DataFrame(index=table.samples) # index to retrieve default input order df_samples["aux|input_order"] = range(df_samples.shape[0], 0, -1) - df_samples["cnt|total"] = table.total - df_samples["cnt|unassigned"] = table.unassigned + df_samples["cnt|total"] = table.get_total() + df_samples["cnt|unassigned"] = table.get_unassigned() # Keep total number of assignemnts for calculations - df_samples["cnt|assigned"] = table.total - table.unassigned + df_samples["cnt|assigned"] = table.get_total() - table.get_unassigned() # Add specific rank assignements for rank in table.ranks(): @@ -344,7 +344,7 @@ def cds_heatmap(table, transformation, show_zeros): # Rename first col to obs stacked_rank_df.rename(columns={stacked_rank_df.columns[0]: "obs"}, inplace=True) stacked_rank_df["rank"] = rank - tv = transform_table(table.data[rank], table.total, transformation, table.zerorep) + tv = transform_table(table.data[rank], table.get_total(), transformation, table.zerorep) stacked_rank_df["tv"] = tv.stack().values #Drop zeros based on original counts if not show_zeros: @@ -486,9 +486,9 @@ def 
@@ -486,9 +486,9 @@ def cds_obsbars(table, top_obs_bars):
         df_obsbars[str(ncol)] = 0
         ncol += 1
     # Other account for filtered taxa (not on top) and left over percentage for the rank without assignment
-    df_obsbars["others"] = table.total - table.unassigned - df_obsbars.sum(axis=1)
-    df_obsbars["unassigned"] = table.unassigned
-    df_obsbars = transform_table(df_obsbars, table.total, "norm", 0) * 100
+    df_obsbars["others"] = table.get_total() - table.get_unassigned() - df_obsbars.sum(axis=1)
+    df_obsbars["unassigned"] = table.get_unassigned()
+    df_obsbars = transform_table(df_obsbars, table.get_total(), "norm", 0) * 100
     df_obsbars["factors"] = df_obsbars.index.to_list()
 
     print_df(df_obsbars, "cds_p_obsbars")
diff --git a/grimer/config.py b/grimer/config.py
index c9114ed..3f3d640 100644
--- a/grimer/config.py
+++ b/grimer/config.py
@@ -1,6 +1,5 @@
 #!/usr/bin/env python3
 import argparse
-import sys
 
 from scipy.spatial.distance import _METRICS_NAMES
 from scipy.cluster.hierarchy import _LINKAGE_METHODS
@@ -9,39 +8,45 @@ class Config:
 
     version = "1.0.0-alpha1"
     default_rank_name = "default"
+    output_plots = ["overview", "samples", "heatmap", "correlation"]
 
     def __new__(self, argv=None):
 
         parser = argparse.ArgumentParser(description='grimer')
-        parser.add_argument('-i', '--input-file', required=True, type=str, help="Main input table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --tranpose if your file is reversed). First column and first row are used as headers.")
-        parser.add_argument('-c', '--config', type=str, help="Configuration file")
-        parser.add_argument('-m', '--metadata-file', type=str, help="Input metadata file in simple tabular format. Sample identifiers will be matched with ones provided by --input-table. QIIME 2 metadata format is also accepted, with categorical and numerical fields.")
-        parser.add_argument('-t', '--tax', type=str, default=None, help="Define taxonomy to use. By default, do not use any taxonomy.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"])
-        parser.add_argument('-b', '--tax-files', nargs="*", type=str, default=[], help="Taxonomy files. If not provided, will automatically be downloaded.")
-        parser.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000")
-        parser.add_argument('-r', '--ranks', nargs="*", default=[Config.default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + Config.default_rank_name + "' to use entries from the table directly. Default: " + Config.default_rank_name)
-        parser.add_argument('-l', '--title', type=str, default="", help="Title to display on the header of the report.")
-        parser.add_argument('-o', '--output-html', type=str, default="output.html", help="File to output report. Default: output.html")
-        parser.add_argument('--full-offline', default=False, action='store_true', help="Embed javascript library in the output file. File will be around 1.5MB bigger but also work without internet connection. That way your report will live forever.")
+        required_group = parser.add_argument_group('required arguments')
+        required_group.add_argument('-i', '--input-file', required=True, type=str, help="Main input table with counts (Observation table, Count table, Contingency Tables, ...) or .biom file. By default rows contain observations and columns contain samples (use --transpose if your file is reversed). First column and first row are used as headers.")
 
-        table_group = parser.add_argument_group('Table options')
-        table_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hiearchical multi-level table where the observations headers are separated by the indicated separator characther (usually ';' or '|')")
-        table_group.add_argument('-y', '--values', default=None, type=str, help="Force 'count' or 'normalized' data parsing. Empty to auto-detect.")
-        table_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table (if samples are listed on columns and observations on rows)")
-        table_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unsassinged/unclassified counts.")
-        table_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on table observations labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.")
-        table_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on table sample labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.")
+        main_group = parser.add_argument_group('main arguments')
+        main_group.add_argument('-m', '--metadata-file', type=str, help="Input metadata file in simple tabular format with samples in rows and metadata fields in columns. QIIME 2 metadata format is also accepted, with an extra row to define categorical and numerical fields. If not provided and --input-file is a .biom file, will attempt to get metadata from it.")
+        main_group.add_argument('-t', '--taxonomy', type=str, default=None, help="Define taxonomy to convert entries and annotate observations. Files will be automatically downloaded and parsed, or can be provided with --tax-files.", choices=["ncbi", "gtdb", "silva", "greengenes", "ott"])
+        main_group.add_argument('-b', '--tax-files', nargs="*", type=str, default=[], help="Optional specific taxonomy files to use.")
+        main_group.add_argument('-r', '--ranks', nargs="*", default=[Config.default_rank_name], type=str, help="Taxonomic ranks to generate visualizations. Use '" + Config.default_rank_name + "' to use entries from the table directly. Default: " + Config.default_rank_name)
Default: " + Config.default_rank_name) + main_group.add_argument('-c', '--config', type=str, help="Configuration file with definitions of references, controls and external tools.") - filter_group = parser.add_argument_group('Observation filter options') - filter_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") - filter_group.add_argument('--max-frequency', type=float, help="Define maximum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].") - filter_group.add_argument('--min-count', type=float, help="Define minimum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") - filter_group.add_argument('--max-count', type=float, help="Define maximum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].") + output_group = parser.add_argument_group('output arguments') + output_group.add_argument('-g', '--mgnify', default=False, action='store_true', help="Use MGNify data") + output_group.add_argument('-d', '--decontam', default=False, action='store_true', help="Run DECONTAM") + output_group.add_argument('-l', '--title', type=str, default="", help="Title to display on the header of the report.") + output_group.add_argument('-p', '--output-plots', nargs="*", type=str, default=Config.output_plots, help="Plots to generate. Default: " + ",".join(Config.output_plots), choices=Config.output_plots) + output_group.add_argument('-o', '--output-html', type=str, default="output.html", help="File to output report. Default: output.html") + output_group.add_argument('--full-offline', default=False, action='store_true', help="Embed javascript library in the output file. File will be around 1.5MB bigger but also work without internet connection. That way your report will live forever.") - overview_group = parser.add_argument_group('Overview options') - overview_group.add_argument('-g', '--mgnify', default=False, action='store_true', help="Use MGNify data") - overview_group.add_argument('-d', '--decontam', default=False, action='store_true', help="Run DECONTAM") + data_group = parser.add_argument_group('general data options') + data_group.add_argument('-f', '--level-separator', default=None, type=str, help="If provided, consider --input-table to be a hiearchical multi-level table where the observations headers are separated by the indicated separator characther (usually ';' or '|')") + data_group.add_argument('-y', '--values', default=None, type=str, help="Force 'count' or 'normalized' data parsing. Empty to auto-detect.") + data_group.add_argument('-s', '--transpose', default=False, action='store_true', help="Transpose --input-table (if samples are listed on columns and observations on rows)") + data_group.add_argument('-u', '--unassigned-header', nargs="*", type=str, default=None, help="Define one or more header names containing unsassinged/unclassified counts.") + data_group.add_argument('--obs-replace', nargs="*", type=str, default=[], help="Replace values on table observations labels/headers (support regex). Example: '_' ' ' will replace underscore with spaces, '^.+__' '' will remove the matching regex.") + data_group.add_argument('--sample-replace', nargs="*", type=str, default=[], help="Replace values on table sample labels/headers (support regex). 
+        data_group.add_argument('-z', '--replace-zeros', type=str, default="1000", help="INT (add 'smallest count'/INT to every raw count), FLOAT (add FLOAT to every raw count). Default: 1000")
+        data_group.add_argument('--min-frequency', type=float, help="Define minimum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].")
+        data_group.add_argument('--max-frequency', type=float, help="Define maximum number/percentage of samples containing an observation to keep the observation [values between 0-1 for percentage, >1 specific number].")
+        data_group.add_argument('--min-count', type=float, help="Define minimum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].")
+        data_group.add_argument('--max-count', type=float, help="Define maximum number/percentage of counts to keep an observation [values between 0-1 for percentage, >1 specific number].")
+
+        sample_group = parser.add_argument_group('Samples options')
+        sample_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Top abundant observations to show in the bars.")
 
         heatmap_group = parser.add_argument_group('Heatmap and clustering options')
         heatmap_group.add_argument('-a', '--transformation', type=str, default="log", help="none (counts), norm (percentage), log (log10), clr (centre log ratio). Default: log")
@@ -55,9 +60,6 @@ def __new__(self, argv=None):
 
         correlation_group = parser.add_argument_group('Correlation options')
         correlation_group.add_argument('-x', '--top-obs-corr', type=int, default=50, help="Top abundant observations to build the correlationn matrix, based on the avg. percentage counts/sample. 0 for all")
0 for all") - bars_group = parser.add_argument_group('Bars options') - bars_group.add_argument('-j', '--top-obs-bars', type=int, default=20, help="Top abundant observations to show in the bars.") - parser.add_argument('-v', '--version', action='version', version='%(prog)s ' + Config.version) parser.add_argument('-D', '--debug', default=False, action='store_true', help=argparse.SUPPRESS) diff --git a/grimer/func.py b/grimer/func.py index 4d3ce11..bbb8b00 100644 --- a/grimer/func.py +++ b/grimer/func.py @@ -71,8 +71,6 @@ def parse_taxonomy(taxonomy, tax_files): def parse_table(args, tax): - table = None - # Specific default params if biom file is provided if args.input_file.endswith(".biom"): if not args.level_separator: @@ -82,6 +80,9 @@ def parse_table(args, tax): # Read and return full table with separated total and unassigned counts (sharing same index) table_df, total, unassigned = parse_input_file(args.input_file, args.unassigned_header, args.transpose, args.sample_replace) + if table_df.empty: + raise Exception("Error parsing input file") + # Define if table is already normalized (0-100) or has count data if args.values == "count": normalized = False @@ -107,8 +108,7 @@ def parse_table(args, tax): ranked_tables, lineage = parse_single_table(table_df, args.ranks, tax, Config.default_rank_name) if not ranked_tables: - print_log("Could not parse input table") - return 1 + raise Exception("Error parsing input file") table = Table(table_df.index, total, unassigned, lineage, normalized, replace_zero_value) @@ -132,9 +132,10 @@ def parse_table(args, tax): print_log("Total valid observations: " + str(len(table.observations(r)))) print_log("") + if not normalized: - print_log("Total assigned (counts): " + str(table.total.sum() - table.unassigned.sum())) - print_log("Total unassigned (counts): " + str(table.unassigned.sum())) + print_log("Total assigned (counts): " + str(table.get_total().sum() - table.get_unassigned().sum())) + print_log("Total unassigned (counts): " + str(table.get_unassigned().sum())) print_log("") return table @@ -594,7 +595,7 @@ def run_decontam(run_decontam, cfg, table, metadata, control_samples): elif not table.normalized: # Use total from table print_log("No concentration provided, using total counts as concentration (frequency for DECONTAM)") - df_decontam["concentration"] = table.total + df_decontam["concentration"] = table.get_total() else: print_log("Cannot run DECONTAM without defined concentration and normalized input values, skipping") return None @@ -641,7 +642,7 @@ def run_decontam(run_decontam, cfg, table, metadata, control_samples): else: # normalize and write temporary table for each rank if not table.normalized: - transform_table(table.data[rank], table.total[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) + transform_table(table.data[rank], table.get_total()[table.data[rank].index], "norm", 0).to_csv(out_table, sep="\t", header=True, index=True) else: table.data[rank].to_csv(out_table, sep="\t", header=True, index=True) @@ -671,7 +672,7 @@ def run_hclustering(table, linkage_methods, linkage_metrics, transformation, ski for rank in table.ranks(): # Get .values of transform, numpy array - matrix = transform_table(table.data[rank], table.total, transformation, table.zerorep).values + matrix = transform_table(table.data[rank], table.get_total(), transformation, table.zerorep).values hcluster[rank] = {} dendro[rank] = {} diff --git a/grimer/grimer.py b/grimer/grimer.py index edac13d..b99bf0a 100755 --- a/grimer/grimer.py 
diff --git a/grimer/grimer.py b/grimer/grimer.py
index edac13d..b99bf0a 100755
--- a/grimer/grimer.py
+++ b/grimer/grimer.py
@@ -53,16 +53,20 @@ def main(argv=sys.argv[1:]):
     cfg = parse_config_file(args.config)
 
     print_log("- Parsing taxonomy")
-    tax = parse_taxonomy(args.tax, args.tax_files)
+    tax = parse_taxonomy(args.taxonomy, args.tax_files)
 
     print_log("- Parsing input table")
-    table = parse_table(args, tax)
+    try:
+        table = parse_table(args, tax)
+    except Exception as e:
+        print(e)
+        return 1
 
     print_log("- Parsing metadata")
     metadata = parse_metadata(args, table)
 
     print_log("- Parsing references")
-    references = parse_references(cfg, tax, args.tax, table.ranks())
+    references = parse_references(cfg, tax, args.taxonomy, table.ranks())
 
     print_log("- Parsing controls")
     controls, control_samples = parse_controls(cfg, table)
@@ -295,7 +299,7 @@ def main(argv=sys.argv[1:]):
     script_dir, _ = os.path.split(__file__)
     logo_path = os.path.join(script_dir, "img", "logo.png")
 
-    final_layout = make_layout(ele, sizes, Config.version, logo_path, args.title)
+    final_layout = make_layout(ele, sizes, Config.version, logo_path, args.title, args.output_plots)
 
     template = include_scripts({os.path.join(script_dir, "js", "func.js"): "script",
                                 os.path.join(script_dir, "js", "popup.js"): "script",
diff --git a/grimer/layout.py b/grimer/layout.py
index 26cd445..6991520 100644
--- a/grimer/layout.py
+++ b/grimer/layout.py
@@ -3,116 +3,107 @@
 import base64
 
 
-def make_layout(ele, sizes, version, logo_path, title):
-
-    filterwidgets = column(ele["obstable"]["wid"]["frequency_spinner"],
-                           ele["obstable"]["wid"]["counts_perc_avg_spinner"],
-                           ele["obstable"]["wid"]["total_counts_spinner"],
-                           ele["obstable"]["wid"]["name_multichoice"],
-                           ele["obstable"]["wid"]["help_button"])
-
-    filterwidgetstabs = Tabs(tabs=[Panel(child=filterwidgets, title="Filter")],
-                             sizing_mode="fixed",
-                             height=sizes["overview_top_panel_height"] + 20,
-                             width=sizes["overview_top_panel_width_left"])
-
-    info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")]
-
-    if ele["references"]["fig"]:
-        info_tabs.append(Panel(child=column(ele["references"]["fig"],
-                                            row(ele["references"]["wid"]["references_select"],
-                                                ele["references"]["wid"]["help_button"])
-                                            ), title="References"))
-
-    if ele["mgnify"]["fig"]:
-        info_tabs.append(Panel(child=column(ele["mgnify"]["fig"],
-                                            row(ele["mgnify"]["wid"]["biome_spinner"],
-                                                ele["mgnify"]["wid"]["help_button"])
-                                            ), title="MGNify"))
-
-    if ele["decontam"]["fig"]:
-        info_tabs.append(Panel(child=column(ele["decontam"]["fig"],
-                                            row(ele["decontam"]["wid"]["pscore_text"],
-                                                ele["decontam"]["wid"]["pscore_input"],
-                                                ele["decontam"]["wid"]["help_button"])
-                                            ), title="DECONTAM"))
-
-    infotabs = Tabs(tabs=info_tabs,
-                    sizing_mode="fixed",
-                    height=sizes["overview_top_panel_height"] + 20,
-                    width=sizes["overview_top_panel_width_right"])
-
-    row_obstable = row(filterwidgetstabs,
-                       ele["obstable"]["fig"],
-                       infotabs,
-                       sizing_mode="stretch_width")
-
-    row_barpot = column(row(ele["samplebars"]["fig"]),
-                        row(ele["samplebars"]["wid"]["y1_select"],
-                            ele["samplebars"]["wid"]["annotbar_rank_select"],
-                            ele["samplebars"]["wid"]["annotbar_select"],
-                            ele["samplebars"]["wid"]["groupby1_select"],
-                            ele["samplebars"]["wid"]["groupby2_select"],
-                            ele["samplebars"]["wid"]["sort_select"],
-                            ele["samplebars"]["wid"]["y2_select"],
-                            ele["samplebars"]["wid"]["help_button"]),
-                        ele["samplebars"]["wid"]["toggle_label"])
-
-    selectwidgets = column(ele["sampletable"]["wid"]["total_counts_spinner"],
-                           ele["sampletable"]["wid"]["assigned_spinner"],
-                           ele["sampletable"]["wid"]["metadata_multichoice"],
-                           ele["sampletable"]["wid"]["help_button"])
ele["sampletable"]["wid"]["help_button"]) - - selectwidgetstabs = Tabs(tabs=[Panel(child=selectwidgets, title="Select")], - sizing_mode="fixed", - height=sizes["overview_top_panel_height"] + 20, - width=sizes["overview_top_panel_width_left"]) - - row_sampletable = row(selectwidgetstabs, - ele["sampletable"]["fig"], - sizing_mode="stretch_width") - - row_obsbars = column(row(ele["obsbars"]["fig"]), - row(ele["obsbars"]["wid"]["rank_select"], - ele["obsbars"]["wid"]["groupby1_select"], - ele["obsbars"]["wid"]["groupby2_select"], - ele["obsbars"]["wid"]["sort_select"], - ele["obsbars"]["wid"]["help_button"]), - ele["obsbars"]["wid"]["toggle_label"]) - - row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]], - [ele["dendrox"]["fig"]], - [ele["annotations"]["fig"], None, ele["heatmap"]["wid"]["help_button"]]], - toolbar_location='right', - merge_tools=True) - - row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"], - ele["heatmap"]["wid"]["toggle_label"], - width=300), - row(column(ele["heatmap"]["wid"]["x_groupby_select"], - ele["heatmap"]["wid"]["x_sort_select"]), - column(ele["heatmap"]["wid"]["y_groupby_select"], - ele["heatmap"]["wid"]["y_sort_select"]), - sizing_mode="stretch_width"), - column(ele["metadata"]["wid"]["metadata_multiselect"], - ele["metadata"]["wid"]["toggle_legend"], - sizing_mode="stretch_height", - width=300)) - - row_correlation = row(column(ele["correlation"]["wid"]["rank_select"], - ele["correlation"]["wid"]["neg_slider"], - ele["correlation"]["wid"]["pos_slider"], - ele["correlation"]["wid"]["toggle_label"], - ele["correlation"]["wid"]["help_button"]), - ele["correlation"]["fig"]) +def make_layout(ele, sizes, version, logo_path, title, output_plots): main_panels = [] - main_panels.append(Panel(child=column(row_obstable, row_barpot, sizing_mode="stretch_width"), title="Overview")) - main_panels.append(Panel(child=column(row_sampletable, row_obsbars, sizing_mode="stretch_width"), title="Samples")) - main_panels.append(Panel(child=column(row_heatmap, row_heatmap_widgets, sizing_mode="stretch_width"), title="Heatmap")) - main_panels.append(Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation")) - main_tab = Tabs(tabs=main_panels) + if "overview" in output_plots: + filterwidgets = column(ele["obstable"]["wid"]["frequency_spinner"], + ele["obstable"]["wid"]["counts_perc_avg_spinner"], + ele["obstable"]["wid"]["total_counts_spinner"], + ele["obstable"]["wid"]["name_multichoice"], + ele["obstable"]["wid"]["help_button"]) + filterwidgetstabs = Tabs(tabs=[Panel(child=filterwidgets, title="Filter")], + sizing_mode="fixed", + height=sizes["overview_top_panel_height"] + 20, + width=sizes["overview_top_panel_width_left"]) + info_tabs = [Panel(child=ele["infopanel"]["textarea"], title="Info")] + if ele["references"]["fig"]: + info_tabs.append(Panel(child=column(ele["references"]["fig"], + row(ele["references"]["wid"]["references_select"], + ele["references"]["wid"]["help_button"]) + ), title="References")) + if ele["mgnify"]["fig"]: + info_tabs.append(Panel(child=column(ele["mgnify"]["fig"], + row(ele["mgnify"]["wid"]["biome_spinner"], + ele["mgnify"]["wid"]["help_button"]) + ), title="MGNify")) + if ele["decontam"]["fig"]: + info_tabs.append(Panel(child=column(ele["decontam"]["fig"], + row(ele["decontam"]["wid"]["pscore_text"], + ele["decontam"]["wid"]["pscore_input"], + ele["decontam"]["wid"]["help_button"]) + ), title="DECONTAM")) + infotabs = Tabs(tabs=info_tabs, + sizing_mode="fixed", + 
height=sizes["overview_top_panel_height"] + 20, + width=sizes["overview_top_panel_width_right"]) + row_obstable = row(filterwidgetstabs, + ele["obstable"]["fig"], + infotabs, + sizing_mode="stretch_width") + row_barpot = column(row(ele["samplebars"]["fig"]), + row(ele["samplebars"]["wid"]["y1_select"], + ele["samplebars"]["wid"]["annotbar_rank_select"], + ele["samplebars"]["wid"]["annotbar_select"], + ele["samplebars"]["wid"]["groupby1_select"], + ele["samplebars"]["wid"]["groupby2_select"], + ele["samplebars"]["wid"]["sort_select"], + ele["samplebars"]["wid"]["y2_select"], + ele["samplebars"]["wid"]["help_button"]), + ele["samplebars"]["wid"]["toggle_label"]) + main_panels.append(Panel(child=column(row_obstable, row_barpot, sizing_mode="stretch_width"), title="Overview")) + + if "samples" in output_plots: + selectwidgets = column(ele["sampletable"]["wid"]["total_counts_spinner"], + ele["sampletable"]["wid"]["assigned_spinner"], + ele["sampletable"]["wid"]["metadata_multichoice"], + ele["sampletable"]["wid"]["help_button"]) + selectwidgetstabs = Tabs(tabs=[Panel(child=selectwidgets, title="Select")], + sizing_mode="fixed", + height=sizes["overview_top_panel_height"] + 20, + width=sizes["overview_top_panel_width_left"]) + row_sampletable = row(selectwidgetstabs, + ele["sampletable"]["fig"], + sizing_mode="stretch_width") + row_obsbars = column(row(ele["obsbars"]["fig"]), + row(ele["obsbars"]["wid"]["rank_select"], + ele["obsbars"]["wid"]["groupby1_select"], + ele["obsbars"]["wid"]["groupby2_select"], + ele["obsbars"]["wid"]["sort_select"], + ele["obsbars"]["wid"]["help_button"]), + ele["obsbars"]["wid"]["toggle_label"]) + main_panels.append(Panel(child=column(row_sampletable, row_obsbars, sizing_mode="stretch_width"), title="Samples")) + + if "heatmap" in output_plots: + row_heatmap = gridplot([[ele["heatmap"]["fig"], ele["dendroy"]["fig"], ele["metadata"]["fig"]], + [ele["dendrox"]["fig"]], + [ele["annotations"]["fig"], None, ele["heatmap"]["wid"]["help_button"]]], + toolbar_location='right', + merge_tools=True) + row_heatmap_widgets = row(column(ele["heatmap"]["wid"]["rank_select"], + ele["heatmap"]["wid"]["toggle_label"], + width=300), + row(column(ele["heatmap"]["wid"]["x_groupby_select"], + ele["heatmap"]["wid"]["x_sort_select"]), + column(ele["heatmap"]["wid"]["y_groupby_select"], + ele["heatmap"]["wid"]["y_sort_select"]), + sizing_mode="stretch_width"), + column(ele["metadata"]["wid"]["metadata_multiselect"], + ele["metadata"]["wid"]["toggle_legend"], + sizing_mode="stretch_height", + width=300)) + main_panels.append(Panel(child=column(row_heatmap, row_heatmap_widgets, sizing_mode="stretch_width"), title="Heatmap")) + + if "correlation" in output_plots: + row_correlation = row(column(ele["correlation"]["wid"]["rank_select"], + ele["correlation"]["wid"]["neg_slider"], + ele["correlation"]["wid"]["pos_slider"], + ele["correlation"]["wid"]["toggle_label"], + ele["correlation"]["wid"]["help_button"]), + ele["correlation"]["fig"]) + main_panels.append(Panel(child=column(row_correlation, sizing_mode="stretch_width"), title="Correlation")) + main_tab = Tabs(tabs=main_panels) logo_base64 = base64.b64encode(open(logo_path, 'rb').read()) # encode to base64 logo_base64 = logo_base64.decode() # convert to string logo_div = Div(text='' + 'v' + version + '', width=300, height=40, sizing_mode="fixed") From b83163dcff894df076862e729a4df988fe4a9fd3 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Sat, 5 Mar 2022 12:30:27 +0100 Subject: [PATCH 47/50] no mgnify class --- grimer/cds.py | 2 +- grimer/func.py | 14 +++++++++++--- grimer/grimer.py | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/grimer/cds.py b/grimer/cds.py index af68fa7..ce8ce1d 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -503,7 +503,7 @@ def cds_mgnify(mgnify, table, tax): # Match uids (taxid or names) from input and keep only found elements uids = [txid for rank in table.ranks() for txid in table.observations(rank)] - df_tmp = mgnify.data[mgnify.data['taxa'].isin(uids)] + df_tmp = mgnify[mgnify['taxa'].isin(uids)] # reset index to properly concate later with biome lineages df_tmp.reset_index(drop=True, inplace=True) diff --git a/grimer/func.py b/grimer/func.py index bbb8b00..a9df6b8 100644 --- a/grimer/func.py +++ b/grimer/func.py @@ -11,7 +11,6 @@ from grimer.decontam import Decontam from grimer.metadata import Metadata from grimer.reference import Reference -from grimer.mgnify import MGnify from grimer.table import Table # Bokeh @@ -219,12 +218,21 @@ def parse_mgnify(run_mgnify, cfg, tax, ranks): if run_mgnify: if cfg is not None and "mgnify" in cfg["external"]: try: - mgnify = MGnify(cfg["external"]["mgnify"], ranks=ranks) + mgnify = pd.read_table(cfg["external"]["mgnify"], header=None, names=["rank", "taxa", "biome", "count"]) except Exception as e: print_log("Failed parsing MGnify database file [" + cfg["external"]["mgnify"] + "], skipping") print_log(str(e)) + # Filter to keep only used ranks, if provided + if ranks: + mgnify = mgnify.loc[mgnify['rank'].isin(ranks)] + mgnify.reset_index(drop=True, inplace=True) + # Convert taxids if tax is provided if tax: - mgnify.update_taxids(update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax)) + updated_nodes = update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax) + mgnify["taxa"] = mgnify[["rank", "taxa"]].apply(lambda rt: updated_nodes[(rt[0], rt[1])] if updated_nodes[(rt[0], rt[1])] is not None else rt[1], axis=1) + if mgnify.empty: + mgnify = None + print_log("No matches with MGnify database, skipping") else: print_log("Not defined in the configuration file, skipping") else: diff --git a/grimer/grimer.py b/grimer/grimer.py index b99bf0a..467b530 100755 --- a/grimer/grimer.py +++ b/grimer/grimer.py @@ -38,6 +38,7 @@ def main(argv=sys.argv[1:]): _debug = args.debug # 1) Load data/analysis + # If not parsed, skipped or error, var is None cfg = None tax = None table = None @@ -106,7 +107,7 @@ def main(argv=sys.argv[1:]): # {x: [min,max], y_cont: [None,None], y_noncont: [None,None]} cds_p_decontam_models = cds_plot_decontam_models(decontam) if decontam else None # stacked: index (taxa, level, lineage), count, perc - cds_p_mgnify = cds_mgnify(mgnify, table, tax) if mgnify else None + cds_p_mgnify = cds_mgnify(mgnify, table, tax) if mgnify is not None else None # stacked: index (repeated sample-ids), obs, rank, ov, tv cds_p_heatmap = cds_heatmap(table, args.transformation, args.show_zeros) # matrix: index (unique sample-ids), md0, md1, ..., md(args.metadata_cols) -> (metadata field, metadata values) From 5ba67bfc6b3101a336c208567455045ee0748274 Mon Sep 17 00:00:00 2001 From: "Vitor C. 
Piro" Date: Sat, 5 Mar 2022 12:47:15 +0100 Subject: [PATCH 48/50] fix bugs types --- grimer/callbacks.py | 3 +-- grimer/cds.py | 6 +++--- grimer/decontam.py | 2 +- grimer/func.py | 2 +- grimer/plots.py | 16 +++++++--------- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/grimer/callbacks.py b/grimer/callbacks.py index 7b07b7a..1e76efe 100644 --- a/grimer/callbacks.py +++ b/grimer/callbacks.py @@ -982,8 +982,7 @@ def link_obsbars_widgets(ele, cds_p_obsbars, dict_d_topobs, dict_d_sampleobs, cd } // Change value of the factors on the obsbars cds - cds_p_obsbars.data["factors"] = factors; - + cds_p_obsbars.data["factors"] = factors; } // Plot sorted factors diff --git a/grimer/cds.py b/grimer/cds.py index ce8ce1d..8cae9d4 100644 --- a/grimer/cds.py +++ b/grimer/cds.py @@ -352,8 +352,8 @@ def cds_heatmap(table, transformation, show_zeros): # initialize factors only for first rank #stacked_rank_df["factors_sample"] = stacked_rank_df.index #stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] - stacked_rank_df["factors_sample"] = stacked_rank_df.index if i==0 else "" - stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] if i==0 else "" + stacked_rank_df["factors_sample"] = stacked_rank_df.index if i == 0 else "" + stacked_rank_df["factors_obs"] = stacked_rank_df["obs"] if i == 0 else "" df_heatmap = pd.concat([df_heatmap, stacked_rank_df], axis=0) @@ -534,7 +534,7 @@ def cds_mgnify(mgnify, table, tax): # Calculate angle for each taxa/level for wedges total_taxa_level = df_biome.groupby("taxa").sum().to_dict()["count"] - df_biome["angle"] = (df_biome['count'] / df_biome['taxa'].map(total_taxa_level)) * (2*pi) + df_biome["angle"] = (df_biome['count'] / df_biome['taxa'].map(total_taxa_level)) * (2 * pi) # Group to the final df df_mgnify = pd.concat([df_mgnify, df_biome], axis=0, ignore_index=True) diff --git a/grimer/decontam.py b/grimer/decontam.py index 8f91cd0..5b2c7fa 100644 --- a/grimer/decontam.py +++ b/grimer/decontam.py @@ -43,5 +43,5 @@ def get_pscore(self, rank, idx): def get_contaminant_list(self): clist = [] for r in self.rank: - clist.extend(self.rank[r].index[self.rank[r]["contaminant"]==True].to_list()) + clist.extend(self.rank[r].index[self.rank[r]["contaminant"] == True].to_list()) return clist diff --git a/grimer/func.py b/grimer/func.py index a9df6b8..3738403 100644 --- a/grimer/func.py +++ b/grimer/func.py @@ -228,7 +228,7 @@ def parse_mgnify(run_mgnify, cfg, tax, ranks): mgnify.reset_index(drop=True, inplace=True) # Convert taxids if tax is provided if tax: - updated_nodes = update_tax_nodes([tuple(x) for x in mgnify.data[["rank", "taxa"]].to_numpy()], tax) + updated_nodes = update_tax_nodes([tuple(x) for x in mgnify[["rank", "taxa"]].to_numpy()], tax) mgnify["taxa"] = mgnify[["rank", "taxa"]].apply(lambda rt: updated_nodes[(rt[0], rt[1])] if updated_nodes[(rt[0], rt[1])] is not None else rt[1], axis=1) if mgnify.empty: mgnify = None diff --git a/grimer/plots.py b/grimer/plots.py index f5ed97d..c4143cd 100644 --- a/grimer/plots.py +++ b/grimer/plots.py @@ -1,12 +1,12 @@ import markdown # Bokeh -from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, CustomJSTransform, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LegendItem, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, PrintfTickFormatter, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput +from bokeh.models import 
-from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, CustomJSTransform, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LegendItem, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, PrintfTickFormatter, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput
+from bokeh.models import AdaptiveTicker, Button, CategoricalColorMapper, CDSView, CheckboxGroup, ColorBar, ColumnDataSource, CustomJS, CustomJSHover, FactorRange, FixedTicker, FuncTickFormatter, HoverTool, Legend, LegendItem, LinearAxis, LinearColorMapper, MultiChoice, MultiSelect, NumberFormatter, Panel, Paragraph, Range1d, RangeSlider, Select, Spacer, Spinner, Tabs, TextAreaInput, TextInput
 from bokeh.models.filters import IndexFilter, GroupFilter
 from bokeh.models.widgets import DataTable, TableColumn
 from bokeh.palettes import Blues, Dark2, Magma256, Reds
 from bokeh.plotting import figure
-from bokeh.transform import cumsum, factor_cmap, transform
+from bokeh.transform import cumsum, factor_cmap
 
 from grimer.func import format_js_toString, make_color_palette
@@ -195,7 +195,7 @@ def plot_obsbars_widgets(ranks, metadata, dict_d_topobs, dict_d_taxname, top_obs
     sort_options["Default"].append(("col|others", "others"))
     sort_options["Default"].append(("col|unassigned", "unassigned"))
 
-    sort_options["Observation"] = [("col|" + str(i), str(i+1)) for i in range(top_obs_bars)]
+    sort_options["Observation"] = [("col|" + str(i), str(i + 1)) for i in range(top_obs_bars)]
 
     sort_options["Numeric Metadata"] = []
     if metadata:
@@ -322,7 +322,6 @@ def plot_obstable(sizes, cds_m_obstable, ranks, references, controls):
     for ctrl_name in controls.keys():
         table_cols.append(TableColumn(field="col|" + ctrl_name, title="(F) " + ctrl_name, default_sort="descending", formatter=NumberFormatter(format="0.00%")))
 
-
     if "col|decontam" in cds_m_obstable.data:
         table_cols.append(TableColumn(field="col|decontam", title="DECONTAM", default_sort="descending"))
 
@@ -430,7 +429,7 @@ def plot_sampletable_widgets(sizes, max_count_samples, metadata):
         metadata_multichoice = Spacer()
 
     help_text = """
-Summary of samples. Entries selected in the table are shown in the barplot below. 
+Summary of samples. Entries selected in the table are shown in the barplot below.
 
 Widgets can select batches of entries in the table by multiple criteria.
 
@@ -457,7 +456,7 @@ def plot_decontam(sizes, cds_p_decontam, cds_p_decontam_lines, min_obs_perc):
                           sizing_mode="stretch_width",
                           tools="save")
 
-    palette = make_color_palette(2) #Control, Sample
+    palette = make_color_palette(2)  # Control, Sample
     factors = list(sorted(set(cds_p_decontam.data["controls"]), reverse=True))
     # Add legend on top
     decontam_fig.add_layout(Legend(), 'above')
@@ -642,7 +641,7 @@ def plot_mgnify_widgets():
-    biome_spinner = Spinner(title="Biome level", low=1, high=5, value=1, step=1, width=100, height=50)#, orientation="horizontal")
+    biome_spinner = Spinner(title="Biome level", low=1, high=5, value=1, step=1, width=100, height=50)  # orientation="horizontal")
 
     help_text = """
 Pie chart with the number of occurrences of the selected taxa in the table by environment (biome) in other microbiome studies analyzed and publicly available at the [MGNify](https://www.ebi.ac.uk/metagenomics) [1] resource.
@@ -920,7 +919,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada
         metadata_fig.add_layout(legend_colorbars[md_header], 'right')
 
     metadata_fig.xaxis.axis_label = "metadata"
-    metadata_fig.xaxis.major_label_orientation = "horizontal" #"vertical"
+    metadata_fig.xaxis.major_label_orientation = "horizontal"
     metadata_fig.xaxis.major_label_text_font_size = "11px"
     metadata_fig.xaxis.minor_tick_line_color = None
     metadata_fig.xgrid.grid_line_color = None
@@ -1111,4 +1110,3 @@ def help_button(title: str="", text: str="", align: str="end"):
     hb.js_on_click(CustomJS(code="pop.open('" + title + "', '" + html_text + "');"))
 
     return hb
-

From 9051dcb9a3f339fc052f623b84631a70c6e60dc5 Mon Sep 17 00:00:00 2001
From: "Vitor C. Piro"
Date: Mon, 7 Mar 2022 10:40:52 +0100
Subject: [PATCH 49/50] last fixes

---
 grimer/callbacks.py |  2 +-
 grimer/cds.py       |  5 ++++-
 grimer/plots.py     | 23 +++++++++++++++--------
 3 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/grimer/callbacks.py b/grimer/callbacks.py
index 1e76efe..5db1d14 100644
--- a/grimer/callbacks.py
+++ b/grimer/callbacks.py
@@ -710,7 +710,7 @@ def link_metadata_widgets(ele, cds_p_metadata, cds_d_metadata, max_metadata_cols
 
             var x_factors = [];
             var empty_y_values = new Array(index_len);
-            for (var i = 0; i < index_len; ++i) empty_y_values[i]=["", ""];
+            for (var i = 0; i < index_len; ++i) empty_y_values[i]="";
 
             // hide all legends
             for (let md_header in legend_colorbars) legend_colorbars[md_header].visible = false;
diff --git a/grimer/cds.py b/grimer/cds.py
index 8cae9d4..2a0d1a8 100644
--- a/grimer/cds.py
+++ b/grimer/cds.py
@@ -53,7 +53,7 @@ def cds_annotations(table, references, controls, decontam, control_samples):
         # Generate a DataFrame to use as source in tables
         df_rank = pd.DataFrame(index=table.observations(rank))
 
-        if decontam:
+        if decontam is not None:
             contaminants = decontam.get_contaminants(rank, df_rank.index).values
             if contaminants.any():
                 df_rank["decontam"] = decontam.get_pscore(rank, df_rank.index)[contaminants]
@@ -268,6 +268,9 @@ def cds_plot_metadata(metadata, max_metadata_cols):
     df_plot_md["1"] = [(first_field, format_js_toString(md_value)) for md_value in metadata.get_col(first_field)]
 
+    # Fill with empty strings to match js output when not selected
+    df_plot_md.fillna("", inplace=True)
+
     print_df(df_plot_md, "cds_p_metadata")
     return ColumnDataSource(df_plot_md)
diff --git a/grimer/plots.py b/grimer/plots.py
index c4143cd..97aa742 100644
--- a/grimer/plots.py
+++ b/grimer/plots.py
@@ -651,7 +651,7 @@ def plot_mgnify_widgets():
     """
 
     return {"biome_spinner": biome_spinner,
-            "help_button": help_button(title="MGNify", text=help_text)}
+            "help_button": help_button(title="MGnify", text=help_text)}
 
 
 def plot_heatmap(table, cds_p_heatmap, tools_heatmap, transformation, dict_d_taxname):
@@ -697,7 +697,8 @@
                          location="center",
                          orientation="vertical",
                          major_label_text_align="left",
-                         major_label_text_font_size="9px")
+                         major_label_text_font_size="9px",
+                         title=transformation)
     heatmap.add_layout(color_bar, 'left')
 
     # Convert taxid ticks to taxa names on client-side
@@ -771,13 +772,19 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, references, co
     toggle_label = CheckboxGroup(labels=["Show observations labels", "Show samples label"], active=[])
 
     help_text = """
-The heatmap shows [transformed] values from the input table (color bar on top). If taxonomy is provided, one heatmap for each taxonomic rank is generated.
+***Heatmap***
+
+The heatmap shows [transformed] values from the input table (color bar on top). Values on both axes can be independently clustered, grouped or sorted.
+Hierarchical clustering uses [scipy linkage](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html). Dendrograms will be plotted in the panels around the heatmap if clustering is selected.
+
+***Metadata***
 
-Values on axis can be independently clustered or sorted. Hierarchical clustering is done with [scipy linkage](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html). If clustering is selected, dendrograms will be plotted in the panels around the heatmap.
+The right-most panel will show metadata values related to each sample (y-axis). Colors are automatically generated for categorical (distinct colors) and numeric (sequential colors) fields. Multiple metadata fields can be selected with Ctrl + click in the metadata selection widget.
 
-The right-most panel will show metadata values related to each sample (y-axis), if provided. Colors are automatically generated for categorical (distinct colors) and numeric (sequential colors) fields. Multiple metadata fields can be select with Ctrl + click in the metadata selection widget.
+***Annotations***
 
-The bottom-most panel shows annotations for each observation/taxa values (x-axis).
+The bottom-most panel shows annotations for each observation (x-axis). Values are transformed and normalized to a 0-1 scale. Reference values are normalized by max. occurrence in the selected rank.
+Control values are their frequency in the specific control annotation. Decontam values are normalized p-scores, with higher values (1) representing low p-scores.
 
 The metadata and annotation plots are automatically sorted to reflect the clustering/sort of the heatmap.
@@ -790,7 +797,7 @@ def plot_heatmap_widgets(ranks, linkage_methods, linkage_metrics, references, co
             "y_groupby_select": y_groupby_select,
             "y_sort_select": y_sort_select,
             "toggle_label": toggle_label,
-            "help_button": help_button(title="Heatmap/Clustering", text=help_text)}
+            "help_button": help_button(title="Heatmap", text=help_text)}
 
 
 def plot_dendrogram(heatmap, tools_heatmap, cds_p_dendro_x, cds_p_dendro_y):
@@ -895,7 +902,7 @@ def plot_metadata(heatmap, tools_heatmap, metadata, cds_d_metadata, cds_p_metada
     metadata_colormap = CategoricalColorMapper(palette=palette, factors=factors)
 
     # Custom tooltip to show metadata field and value
-    md_custom = CustomJSHover(code='return value[0] ? "(" + value[0] + ") " + value[1] : "";')
+    md_custom = CustomJSHover(code='return value ? "(" + value[0] + ") " + value[1] : "";')
     tooltips = [('Sample', '@index')]
     formatters = {}
     for col in cols:

From 4605afb22c56f0ef690b5426f9c04317b9ec07eb Mon Sep 17 00:00:00 2001
From: "Vitor C. Piro"
Date: Mon, 7 Mar 2022 11:03:06 +0100
Subject: [PATCH 50/50] updated README

---
 README.md        | 27 +++++++++++++++++++++------
 grimer/mgnify.py | 25 -------------------------
 2 files changed, 21 insertions(+), 31 deletions(-)
 delete mode 100644 grimer/mgnify.py

diff --git a/README.md b/README.md
index 732159a..5c7e69a 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 ![GRIMER](grimer/img/logo.png)
 
-GRIMER automates analysis and reports an offline and interactive dashboard integrating annotation, taxonomy and metadata to analyse microbiome studies and detect contamination.
+GRIMER performs analysis of microbiome data and generates a portable and interactive dashboard integrating annotation, taxonomy and metadata.
 
 ## Examples
@@ -22,19 +22,34 @@
 grimer -h
 ```
 
 ## Usage
 
-### Basic
+### Tab-separated input table
 ```bash
-grimer -i input_table.tsv -m metadata.tsv -c config/default.yaml
+grimer -i input_table.tsv
+```
+
+### BIOM file
+```bash
+grimer -i myfile.biom
+```
+
+### Tab-separated input table with taxonomically annotated observations (e.g. sk__Bacteria;k__;p__Actinobacteria;c__Actinobacteria...)
+```bash
+grimer -i input_table.tsv -f ";"
+```
+
+### Tab-separated input table with metadata
+```bash
+grimer -i input_table.tsv -m metadata.tsv
 ```
 
 ### With taxonomy integration (ncbi)
 ```bash
-grimer -i input_table.tsv -m metadata.tsv -c config/default.yaml -t ncbi #optional -b taxdump.tar.gz
+grimer -i input_table.tsv -m metadata.tsv -t ncbi #optional -b taxdump.tar.gz
 ```
 
-### With DECONTAM and MGnify annotations
+### With configuration file to set up external tools, references and annotations
 ```bash
-grimer -i input_table.tsv -m metadata.tsv -c config/default.yaml -d -g
+grimer -i input_table.tsv -m metadata.tsv -t ncbi -c config/default.yaml -d -g
 ```
 
 ### List all options
diff --git a/grimer/mgnify.py b/grimer/mgnify.py
deleted file mode 100644
index db8b974..0000000
--- a/grimer/mgnify.py
+++ /dev/null
@@ -1,25 +0,0 @@
-import pandas as pd
-
-
-class MGnify:
-
-    def __init__(self, mgnify_file, ranks: list=[]):
-        self.data = self.parse(mgnify_file, ranks)
-
-    def __repr__(self):
-        args = ['{}={}'.format(k, repr(v)) for (k, v) in vars(self).items()]
-        return 'MGnify({})'.format(', '.join(args))
-
-    def parse(self, file, ranks):
-        mgnify_df = pd.read_table(file, header=None, names=["rank", "taxa", "biome", "count"])
-        # Filter by ranks if provided
-        if ranks:
-            mgnify_df = mgnify_df.loc[mgnify_df['rank'].isin(ranks)]
-            mgnify_df.reset_index(drop=True, inplace=True)
-
-        #mgnify_df.drop(columns="rank", inplace=True)
-        return mgnify_df
-
-    def update_taxids(self, taxid_updated):
-        # Update taxonomy to taxid or keep name if not available
-        self.data["taxa"] = self.data[["rank", "taxa"]].apply(lambda rt: taxid_updated[(rt[0], rt[1])] if taxid_updated[(rt[0], rt[1])] is not None else rt[1], axis=1)
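For context on the `-c config/default.yaml` examples above: the configuration file supplies the `references`, `controls` and `external` sections consumed by `parse_references`, `parse_controls` and `parse_mgnify` earlier in this series. A minimal hypothetical sketch, with all labels and file paths illustrative:

```yaml
references:
  "Contaminants": "files/contaminants.yml"             # hypothetical reference file
controls:
  "Negative Controls": "files/negative_controls.txt"   # one sample identifier per line
external:
  mgnify: "files/mgnify.tsv"                           # headerless TSV: rank, taxa, biome, count
```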