diff --git a/.gitignore b/.gitignore index 1ad710f..aaed6f2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,10 +16,16 @@ src/docs/*.html src/*.html src/notebooks/**/*.html src/*-listing.json +renv/ +renv.lock +.Rhistory *.parquet !**/example-data/**/*.parquet *.sig ko*.pathview.png src/notebooks/R Examples/*.tsv -src/notebooks/R Examples/*.txt \ No newline at end of file +src/notebooks/R Examples/*.txt +.Rproj.user +.RData +.Rprofile diff --git a/src/notebooks/R Mia Examples/Comparative-Metagenomics.ipynb b/src/notebooks/R Mia Examples/Comparative-Metagenomics.ipynb new file mode 100644 index 0000000..f299d81 --- /dev/null +++ b/src/notebooks/R Mia Examples/Comparative-Metagenomics.ipynb @@ -0,0 +1,902 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: \"Comparative Metagenomics\"\n", + "author:\n", + " - name: Sam Hillman\n", + " affiliation:\n", + " - id: mia\n", + " name: Department of Computing, University of Turku, Finland\n", + "toc: true\n", + "number-sections: true\n", + "categories: [R]\n", + "execute: \n", + " eval: true\n", + "---\n", + "\n", + "::: {style=\"max-width:1200px\"}\n", + "![](./../_resources/mgnify_logo.png)\n", + "\n", + "# Comparative metagenomics\n", + "# Normalization methods, alpha & beta diversity, and differentially abundant taxa\n", + "\n", + "In this notebook we aim to demonstrate how the MGnifyR tool can be used to fetch \n", + "data and metadata of a MGnify metagenomic analysis \n", + "\n", + "Then we show how to generate diversity metrics for comparative metagenomics using\n", + "taxonomic profiles. \n", + "\n", + "Finally, we use the `mikropml` package to detect differentially abundant taxa and\n", + "to build an ML classification model. \n", + "\n", + "It's important to note, we're using methods and protocols that align with the \n", + "[mia](https://microbiome.github.io/mia/) package, which can be read about further \n", + "in the [OMA](https://microbiome.github.io/OMA/docs/devel/) guides.\n", + "\n", + "[MGnifyR](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html) \n", + "is a library that provides a set of tools for easily accessing and processing \n", + "MGnify data in R, making queries to MGnify databases through the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/). \n", + "\n", + "The benefit of MGnifyR is that data can either be fetched in tsv format or be directly \n", + "combined in a [TreeSummarizedExperiment](https://www.bioconductor.org/packages/release/bioc/vignettes/TreeSummarizedExperiment/inst/doc/Introduction_to_treeSummarizedExperiment.html)\n", + "object to run an analysis in a custom workflow.\n", + "\n", + "`TreeSummarizedExperiment` is supported by the [mia bioconductor package](https://microbiome.github.io/mia/), \n", + "which is dedicated to microbiome analysis.\n", + "\n", + "This is an interactive code notebook (a Jupyter Notebook). To run this code, \n", + "click into each cell and press the ▶ button in the top toolbar, or press `shift+enter`.\n", + "\n", + "------------------------------------------------------------------------\n", + "\n", + "```{r}\n", + "# Loading libraries\n", + "suppressMessages({\n", + " library(ggplot2)\n", + " library(MGnifyR)\n", + " library(tidyverse)\n", + " library(vegan)\n", + " library(mia)\n", + " library(miaViz)\n", + " library(biomformat)\n", + " library(scater)\n", + " library(metagenomeSeq)\n", + " library(reshape2)\n", + " library(viridis)\n", + " library(patchwork)\n", + " library(IRdisplay)\n", + " library(ALDEx2)\n", + " library(ANCOMBC)\n", + " library(mikropml)\n", + " library(MLeval)\n", + " library(caret)\n", + " library(scales)\n", + "})\n", + "\n", + "display_markdown(file = '../_resources/mgnifyr_help.md')\n", + "```\n", + "\n", + "# Fetching data and preprocessing\n", + "In this example we are going to fetch MGnify analysis results and metadata for a \n", + "dataset on the human gut microbiome viewed across age and geography - \n", + "([MGYS00005154](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005154)).\n", + "\n", + "## Setting the study accession\n", + "1. First, we need to specify the accession number of the study we're working with. \n", + "This can be done by setting the `mgnify_study_accession` variable. The accession \n", + "number uniquely identifies the study in the MGnify database.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "\n", + "source(\"./utils/variable_utils.r\")\n", + "\n", + "mgnify_study_accession <- get_variable_from_link_or_input('MGYS', 'Study Accession', \n", + " 'MGYS00005154')\n", + "\n", + "# You can also just directly set the accession variable in code, like this:\n", + "# mgnify_study_accession <- \"MGYS00005292\"\n", + "```\n", + "\n", + "2. To interact with the MGnify database, we need to create an MgnifyClient object. \n", + "This object allows us to fetch data from MGnify, and we can configure it to use \n", + "a cache for efficiency. \n", + "\n", + "```{r}\n", + "#| output: false\n", + "# Check if the cache directory exists, if not, create it\n", + "if (!dir.exists(\"./.mgnify_cache\")) {\n", + " dir.create(\"./.mgnify_cache\", recursive = TRUE)\n", + "}\n", + "\n", + "# Create the MgnifyClient object with caching enabled\n", + "mg <- MgnifyClient(usecache = TRUE, cache_dir = \"./.mgnify_cache\")\n", + "```\n", + "\n", + "3. Using the MgnifyClient object, we can search for all analyses associated with \n", + "the study accession number we set earlier. This will return a list of analysis \n", + "accession numbers.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "analyses_accessions <- searchAnalysis(mg, \"studies\", mgnify_study_accession)\n", + "analyses_accessions\n", + "```\n", + "\n", + "## Download metadata\n", + "1. First, download the metadata for the first 20 Analyses and put it into a dataframe.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "analyses_metadata_df <- getMetadata(mg, head(analyses_accessions, 30))\n", + "```\n", + "\n", + "2. The table could be big, so let's look at a sample of it (`head`).\n", + "\n", + "```{r}\n", + "#| output: false\n", + "t(head(analyses_metadata_df))\n", + "```\n", + "\n", + "## Convert to `TreeSummarizedExperiment` object.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "tse <- getResult(mg, analyses_metadata_df$analysis_accession)\n", + "```\n", + "\n", + "# Normalization, alpha diversity indices and taxonomic profiles visualization\n", + "\n", + "Let's Show some stats on sequencing depth across samples.\n", + "\n", + "```{r}\n", + "# Calculate and print max difference\n", + "max_difference <- max(tse$sum) / min(tse$sum)\n", + "sprintf(\"The max difference in sequencing depth is %s\", max_difference)\n", + "```\n", + "\n", + "An approximate 4.5-fold difference in the library sizes means that we will need \n", + "to apply a normalization method before continuing with the analysis. The most \n", + "common normalization methods used in microbiome count data are proportions and rarefaction. \n", + "\n", + "However, other methods originally developed to normalize RNA-seq counts have been \n", + "adapted to differential-abundance analysis in microbiome data. A discussion about \n", + "how to choose the right normalization method is out of the scope of this material, \n", + "but the topic has been covered in multiple forums and scientific publications. \n", + "\n", + "Depending on the downstream analysis we intend to perform, different methods might \n", + "be appropriate. For instance, to compare groups of samples at community-level \n", + "through beta-diversity, “…proportions and rarefying produced more accurate comparisons \n", + "among communities and are the only methods that fully normalized read depths across \n", + "samples.\n", + "\n", + "Additionally, upper quartile, cumulative sum scaling (CSS), edgeR-TMM, and DESeq-VS \n", + "often masked differences among communities when common OTUs differed, and they \n", + "produced false positives when rare OTUs differed” ([1](#ref-mcknight2019)). On \n", + "the other hand, for detection of differentially abundant species, “both proportions \n", + "and rarefied counts result in a high rate of false positives in tests for species \n", + "that are differentially abundant across sample classes” ([2](#ref-mcmurdie2014)).\n", + "\n", + "However, rarefaction can utilize information in the full read count data, and \n", + "the debate on the pros and cons of rarefaction is ongoing ([3](#ref-schloss2024a), \n", + "[4](#ref-schloss2024b)).\n", + "\n", + "In the following examples we will show two popular ways of normalization: \n", + "relative abundance transformation (transforming features into zero mean, unit variance - \n", + "also called relative abundance transformation) and rarefaction.\n", + "\n", + "## Normalization by total sum scaling (TSS, relative abundance or proportions)\n", + "The simplest way to normalize the differences in sample size is to do a relative \n", + "abundance transformation on the OTU counts table by dividing by the number of \n", + "total reads of each sample. This type of normalization is also referred to as \n", + "relative abundance or proportions. \n", + "\n", + "We use this normalization to compare taxonomic profiles, while alpha diversity \n", + "indices are computed on the non-normalized matrix. The reason to do so is that \n", + "we need a matrix of integer numbers as input.\n", + "\n", + "1. Apply the relative abundance transformation to raw counts matrix.\n", + "\n", + "```{r}\n", + "tse <- transformAssay(tse, name = \"relabundance\", method = \"relabundance\")\n", + "```\n", + "\n", + "2. Agglomerate taxonomy at Class rank and save to the `glom_tse_relab` object \n", + "within `altExp`. Then remove all assays other than `relabundance` for simplicity.\n", + "\n", + "```{r}\n", + "# Agglomerate at class rank\n", + "altExp(tse, \"glom_tse_relab\") <- agglomerateByRank(tse, rank = \"Class\")\n", + "\n", + "# Remove assays other than relabundance\n", + "assays(altExp(tse, \"glom_tse_relab\")) <- assays(altExp(tse, \"glom_tse_relab\"))[\"relabundance\"]\n", + "```\n", + "\n", + "3. Get the top 15 features of the `relabundance` assay and subset the `altExp` \n", + "`relabundance` assay by those top 15 features.\n", + "\n", + "```{r}\n", + "# Take the top 15 features based on their means\n", + "top15_relabundance <- getTop(altExp(tse, \"glom_tse_relab\"), method = \"mean\", \n", + " assay.type = \"relabundance\", top = 15L)\n", + "\n", + "# Subset relabundance tse by top features\n", + "altExp(tse, \"glom_tse_relab\") <- altExp(tse, \"glom_tse_relab\")[top15_tss, ]\n", + "```\n", + "\n", + "## Normalization by subsampling (rarefaction)\n", + "Rarefaction is an alternative to doing a relative abundance transformation to \n", + "obtain an adjusted OTUs count matrix. The method is based on a process of subsampling \n", + "to the smallest library size in the data set. \n", + "\n", + "The algorithm randomly removes reads until the samples reach the same library size. \n", + "Despite the apparent disadvantage of discarding information from the larger samples, \n", + "rarefaction is quite popular in microbial ecology. \n", + "\n", + "The first step is to find the smallest sample size. We can use the number of \n", + "observed OTUs in the original matrix to do so.\n", + "\n", + "1. Find the smallest sample size.\n", + "\n", + "```{r}\n", + "min_size <- min(colSums2(assay(tse, \"counts\")))\n", + "```\n", + "\n", + "2. Rarefying to the smallest sample.\n", + "\n", + "```{r}\n", + "tse <- rarefyAssay(tse, assay.type = \"counts\", size = min_size, name = \"rarefy\")\n", + "```\n", + "\n", + "3. Agglomerate taxonomy at Class rank and save to the `glom_tse` object within `altExp`. \n", + "Then remove all assays other than `rarefy` for simplicity.\n", + "\n", + "```{r}\n", + "# Agglomerate at class rank\n", + "altExp(tse, \"glom_tse_rarefy\") <- agglomerateByRank(tse, rank = \"Class\")\n", + "\n", + "# Remove assays other than rarefy\n", + "assays(altExp(tse, \"glom_tse_rarefy\")) <- assays(altExp(tse, \"glom_tse_rarefy\"))[\"rarefy\"]\n", + "```\n", + "\n", + "\n", + "4. Get the top 15 features of the `rarefy` assay and subset the `altExp`\n", + "`rarefy` assay by those top 15 features. \n", + "\n", + "```{r}\n", + "# Get the top features\n", + "top15_rarefy <- getTop(altExp(tse, \"glom_tse_rarefy\"), top = 15L, \n", + " assay.type = \"rarefy\", method = \"mean\")\n", + "\n", + "# Subset original rarefy tse by top features\n", + "altExp(tse, \"glom_tse_rarefy\") <- altExp(tse, \"glom_tse_rarefy\")[top15_rarefy, ]\n", + "```\n", + "\n", + "## OTU Filtering\n", + "\n", + "1. First, identify the total counts per sample and add them to the colData of our \n", + "`tse` object.\n", + "\n", + "```{r}\n", + "tse <- addPerCellQC(tse)\n", + "```\n", + "\n", + "2. We then plot a histogram to identify outliers, so we can remove samples with \n", + "low coverage.\n", + "\n", + "```{r}\n", + "scatter_plot <- plotColData(tse, y = \"sum\") +\n", + " scale_y_log10(\n", + " breaks = trans_breaks(\"log10\", function(x) 10^x, n = 8),\n", + " labels = trans_format(\"log10\", math_format(10^.x))\n", + " ) +\n", + " labs(x = \"Samples\", y = \"Library size (log scale)\") \n", + "```\n", + "\n", + "3. We can see that samples with number of reads above 10^6.2 seem to be outliers. \n", + "Let’s filter out the outliers and plot a new histogram.\n", + "\n", + "At the same time, we can also filter out singletons. Singletons are OTUs of size \n", + "one, meaning that only one read was assigned to that OTU. These very low-abundance \n", + "OTUs could be biologically real (belonging to the rare biosphere ([5](#ref-lynch2015))), \n", + "or they could be false positives due to sequencing artefacts. Singletons observed \n", + "in only one sample are more likely to be artefacts, and it is good practice to \n", + "remove them from the OTUs counts table to avoid artificially over-estimating the \n", + "OTUs richness. You can find more discussion about this in [Robert Edgar’s blog](https://drive5.com/usearch/manual/singletons.html).\n", + "\n", + "```{r}\n", + "# Subset the tse object to keep only samples with library sizes > 10^6.2 \n", + "# and non-singleton taxa in the filtered data\n", + "tse <- tse[rowSums(assay(tse, \"counts\")) > 1, tse$sum < 10^6.2]\n", + "\n", + "# Now we redo the scatter plot\n", + "filtered_scatter_plot <- plotColData(tse, y = \"sum\") +\n", + " scale_y_log10(\n", + " breaks = trans_breaks(\"log10\", function(x) 10^x, n = 8),\n", + " labels = trans_format(\"log10\", math_format(10^.x))\n", + " ) +\n", + " labs(x = \"Samples\", y = \"Library size (log scale)\") \n", + "```\n", + "\n", + "## Computing alpha diversity indices\n", + "Alpha diversity is a measure of species diversity in a particular area or an ecosystem. \n", + "In terms of microbial ecology, analyzing the sample diversity of sequencing data \n", + "is commonly the first step in assessing differences between microbial environments. \n", + "Some of the most popular alpha diversity indices reported in microbial community \n", + "analyses are Chao, abundance-based coverage estimators (ACE), Simpson, and Shannon-Weaver.\n", + "\n", + "Chao1 and ACE are nonparametric estimators that describe diversity as the max \n", + "number of expected species in the sample (species richness). They consider the \n", + "proportion of species that have been observed before (“recaptured”) to those that \n", + "are observed only once. Both Chao1 and ACE underestimate true richness at low sample sizes.\n", + "\n", + "On the other hand, Simpson and Shannon-Weaver, consider relative abundances and \n", + "depends on both, species richness and the evenness (or equitableness), with which \n", + "individuals are distributed among the different species. However, both metrics have \n", + "specific biases. The Shannon-Weaver index places a greater weight on species richness, \n", + "whereas the Simpson index considers evenness more than richness in its measurement.\n", + "\n", + "More discussion and examples illustrating how sample size and normalization methods \n", + "can affect alpha diversity metrics in references [6](#ref-willis2019) and [7](#ref-kim2017).\n", + "\n", + "In this example, we're going to calculate the diversity measures for our original \n", + "counts data, and for our rarefied original counts data.\n", + "\n", + "1. Now, let's use the `addAlpha` method to compute the alpha diversity indices \n", + "on original raw counts data, and we use the `niter` argument to rarefy our original \n", + "counts data to calculate the shannon diversity for rarefied data. We also add a \n", + "`SampleID` colData column to `tse` to help with out plotting too. \n", + "\n", + "For this example we'll use the `shannon_diversity` index to show alpha diversity, \n", + "though [see here](https://microbiome.github.io/mia/reference/addAlpha.html) for \n", + "more index options.\n", + "\n", + "```{r}\n", + "# Add SampleID to colData\n", + "tse$SampleID <- colnames(tse)\n", + "\n", + "# Compute alpha diversity on normal counts data\n", + "tse <- addAlpha(tse, index = \"shannon_diversity\", name = \"shannon_diversity_normal\")\n", + "\n", + "# Compute alpha diversity on rarefied data\n", + "tse <- addAlpha(tse, index = \"shannon_diversity\", name = \"shannon_diversity_rarefy\", \n", + " niter = 10)\n", + "```\n", + "\n", + "2. Plot alpha diversity with `plotColData`, generating a plot for our shannon \n", + "diversity calculation on both our original counts data and rarefied data. We save \n", + "both plots to `plots`.\n", + "\n", + "```{r}\n", + "# Use lapply to generate plots\n", + "plots <- lapply(c(\"shannon_diversity_normal\", \"shannon_diversity_rarefy\"), function(y) {\n", + " label <- ifelse(y == \"shannon_diversity_normal\", \"normal\", \"rarefy\")\n", + " plotColData(tse, x = \"SampleID\", y = y, colour_by = \"SampleID\") +\n", + " labs(y = paste(\"Shannon Diversity Index (\", label, \")\", sep = \"\")) +\n", + " theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6))\n", + "})\n", + "```\n", + "\n", + "3. Combine all plots using patchwork.\n", + "\n", + "```{r}\n", + "# Combine all plots using patchwork\n", + "wrapped_plots <- wrap_plots(plots) +\n", + " plot_layout(guides = \"collect\") + \n", + " plot_annotation(title = \"Shannon Diversity Calculations\",\n", + " theme = theme(plot.title = element_text(size = 12, hjust = 0.5)))\n", + "```\n", + "\n", + "## Community profile visualization\n", + "Here we visualise the taxonomic profiles in barplots at class ranks for our top \n", + "tss normalized taxa and rarefied taxa.\n", + "\n", + "1. Let's store our `glom_tse_relab` and `glom_tse_rarefy` objects in \n", + "variables to make referencing them easier.\n", + "\n", + "```{r}\n", + "glom_tse_relab <- altExp(tse, \"glom_tse_relab\")\n", + "glom_tse_rarefy <- altExp(tse, \"glom_tse_rarefy\")\n", + "```\n", + "\n", + "\n", + "2. Lets change any unknown taxonomic rowData information to \"Unknown\" from \"NA\" \n", + "in our `tse` objects.\n", + "\n", + "```{r}\n", + "rowData(glom_tse_relab)$Class <- ifelse(is.na(rowData(glom_tse_relab)$Class), \"Unknown\", \n", + " rowData(glom_tse_relab)$Class)\n", + "rowData(glom_tse_rarefy)$Class <- ifelse(is.na(rowData(glom_tse_rarefy)$Class), \"Unknown\", \n", + " rowData(glom_tse_rarefy)$Class)\n", + "```\n", + "\n", + "3. Create the plots with the `plotAbundance` `mia` method. We use our agglomerated \n", + "`relabundance` and `rarefy` assays from our `tse` objects to do this.\n", + "\n", + "```{r}\n", + "# Plotting our tss data\n", + "relabundance_plot <- plotAbundance(glom_tse_relab,\n", + " assay.type = \"relabundance\",\n", + " rank = \"Class\",\n", + " order_rank_by = \"abund\")\n", + "\n", + "# Plotting our rarefied data\n", + "rarefy_plot <- plotAbundance(glom_tse_rarefy,\n", + " assay.type = \"rarefy\",\n", + " rank = \"Class\",\n", + " order_rank_by = \"abund\")\n", + "```\n", + "\n", + "By naked eye, we can see that the profiles change. Let’s take a deeper look at \n", + "some interesting Classes to check how the abundance change depending on the \n", + "normalization method.\n", + "\n", + "1. Define our subset classes\n", + "\n", + "```{r}\n", + "subset_classes <- c(\"Betaproteobacteria\", \n", + " \"Bacteroidia\", \n", + " \"Actinobacteria\")\n", + "```\n", + "\n", + "\n", + "2. Subset our `tse` objects.\n", + "\n", + "```{r}\n", + "glom_tse_relab <- glom_tse_relab[rowData(glom_tse_relab)$Class %in% subset_classes, ]\n", + "glom_tse_rarefy <- glom_tse_rarefy[rowData(glom_tse_rarefy)$Class %in% subset_classes, ]\n", + "```\n", + "\n", + "3. Plot our new plots with the `plotAbundance` `mia` method.\n", + "\n", + "```{r}\n", + "relabundance_plot <- plotAbundance(glom_tse_relab, \n", + " assay.type = \"relabundance\", \n", + " rank = \"Class\", \n", + " order_rank_by = \"abund\")\n", + "\n", + "rarefy_plot <- plotAbundance(glom_tse_rarefy, \n", + " assay.type = \"rarefy\", \n", + " rank = \"Class\", \n", + " order_rank_by = \"abund\")\n", + "```\n", + "\n", + "## Comparative metagenomics at community level: Beta diversity\n", + "\n", + "Beta (β)-diversity is used in ecology to assess the heterogeneity of local communities. \n", + "It is defined as the ratio between regional and local species diversity. In simpler \n", + "terms, it calculates the number of species that are not the same in two different \n", + "environments.\n", + "\n", + "Some of the most popular beta diversity measures in microbiome research include \n", + "Bray-Curtis index (for compositional data), Jaccard index (for presence/absence \n", + "data, ignoring abundance information), Aitchison distance (Euclidean distance \n", + "for transformed abundances, aiming to avoid the compositionality bias), and the \n", + "Unifrac distances (that use the phylogenetic tree information). \n", + "\n", + "Only some of the commonly used beta diversity measures are actual distances, \n", + "therefore, the term dissimilarity or beta-diversity is commonly used. The workflow \n", + "to assess β-diversity between groups of samples starts with calculating the diversity \n", + "indices for each pair of samples. This results in a distance (or dissimilarity) \n", + "matrix that is often used for ordination (e.g., principal coordinates analysis) \n", + "and data exploration in microbiota data analysis ([8](#ref-wagner2018)).\n", + "\n", + "For statistical analysis, [adonis](https://www.rdocumentation.org/packages/vegan/versions/2.6-4/topics/adonis) \n", + "and [betadisper](https://www.rdocumentation.org/packages/vegan/versions/2.6-4/topics/betadisper) \n", + "methods are widely used together in β-diversity analyses.\n", + "\n", + "PERMANOVA tests whether two or more groups have different compositions. PERMANOVA \n", + "calculates the squared deviations of each site to the centroid and then, performs \n", + "significance tests using F-tests based on sequential sums of squares from \n", + "permutations of the raw data. It can be seen as an ANOVA using distance matrices \n", + "(analogous to MANOVA). A non significant p-value (p>0.05), means that there’s no \n", + "significant difference in composition between groups (we accept the null hypothesis \n", + "of no significant difference in composition between groups).\n", + "\n", + "Betadisper tests if two or more groups are homogeneously dispersed in relation to \n", + "their species in studied samples. This test can be done to see if one group has \n", + "more compositional variance than another. The method use the distance matrix to \n", + "calculate the multivariate dispersions (variances; average distance to centroids). \n", + "\n", + "Then we use group dispersions to perform an ANOVA test. ANOVA’s p-value not significant \n", + "(p>0.05) means that group dispersions are homogenous (null hypothesis of no difference \n", + "in dispersion between groups).\n", + "\n", + "Compute beta diversity using various methods to calculate distance, and perform \n", + "principle components analysis ploting the first two axes. According to Pereira \n", + "et al., (2018)([9](#ref-pereira2018)), the best normalization method for metagenomic \n", + "gene abundance is CSS for large group sizes. For this reason, we will use this \n", + "method to show beta-diversity. \n", + "\n", + "Using as a guide the steps described in ([10](#ref-mcmurdie2018)), we will create \n", + "a list of suitable distance methods, iterate through them, and display a combined \n", + "plot. For a better visualization we are going to show the 95% confidence region \n", + "with an ellipse.\n", + "\n", + "1. Let's define the list of methods we're going to test.\n", + "\n", + "```{r}\n", + "# Define method list\n", + "vegdist_methods <- c(\"manhattan\", \"euclidean\", \"bray\", \"jaccard\",\n", + " \"gower\", \"chao\")\n", + "\n", + "# Create empty vectors to hold method names and plots\n", + "created <- c()\n", + "plots <- c()\n", + "```\n", + "\n", + "2. Loop through these method types and plot abundance for each, saving each plot \n", + "into the `plots` vector.\n", + "\n", + "```{r}\n", + "# Loop through these methods and plot abundance\n", + "for (method in vegdist_methods) {\n", + " # Set the method name\n", + " method_name <- paste(\"MDS\", method, sep = \"_\")\n", + " created <- method_name\n", + " \n", + " # Run MDS \n", + " mds_tse <- runMDS(tse,\n", + " FUN = vegan::vegdist,\n", + " method = method,\n", + " assay.type = \"relabundance\",\n", + " name = method_name)\n", + " \n", + " # Create plot object of reducedDim slot\n", + " p <- plotReducedDim(mds_tse, dimred = paste(\"MDS\", method, sep = \"_\"))\n", + " \n", + " # Calculate explained variance\n", + " e <- attr(reducedDim(mds_tse, method_name), \"eig\")\n", + " rel_eig <- e / sum(e[e > 0])\n", + " \n", + " # Add explained variance for each axis\n", + " p <- p + labs(x = paste(\"MDS_1_\", method, \" (\", round(100 * rel_eig[1], 1), \"%)\", \n", + " sep = \"\"),\n", + " y = paste(\"MDS_2_\", method, \" (\", round(100 * rel_eig[2], 1), \"%)\", \n", + " sep = \"\"),\n", + " title = paste(\"MDS using distance method\", method)) +\n", + " theme_minimal(base_size = 6) + # Use a minimal theme for clarity\n", + " theme(\n", + " plot.title = element_text(hjust = 0.5), # Center the title\n", + " axis.title = element_text(size = 6), # Smaller axis titles\n", + " axis.text = element_text(size = 6), # Smaller axis labels\n", + " legend.position = \"bottom\", # Position the legend at the bottom\n", + " legend.title = element_text(size = 6), # Smaller legend title\n", + " legend.text = element_text(size = 6) # Smaller legend text\n", + " )\n", + " \n", + " # Save plot\n", + " plots[[method_name]] <- p\n", + "}\n", + "```\n", + "\n", + "3. Combine all plots using patchwork.\n", + "\n", + "```{r}\n", + "# Combine all plots using patchwork\n", + "wrapped_plots <- wrap_plots(plots) +\n", + " plot_layout(guides = \"collect\") + \n", + " plot_annotation(title = \"MDS on Various Distance Metrics\",\n", + " theme = theme(plot.title = element_text(size = 14, hjust = 0.5)))\n", + "```\n", + "\n", + "Separation of the groups observed in some of the ordination plots could be due to \n", + "a location effect, dispersion effect, or both. To test the location effect (null \n", + "hypothesis of no difference in composition between groups) we use a permanova test, \n", + "using the bray method to calculate the distance matrix. \n", + "\n", + "PERMANOVA assumes there is homogeneity of dispersion among groups. We want to also \n", + "test this assumption, to check whether the differences detected by our permanova \n", + "test are due to variation in dispersion of the data. \n", + "\n", + "We do this using the betadisper method to calculate variances and then evaluate \n", + "if there’s a significant variation between groups through an anova.\n", + "\n", + "To do all these tests, we use the `addRDA` method, which will do all of them for us.\n", + "\n", + "```{r}\n", + "beta_diversity_tse <- addRDA(\n", + " tse,\n", + " formula = data ~ sample_geographic.location..country.and.or.sea.region.,\n", + " assay.type = \"relabundance\",\n", + " method = \"bray\",\n", + " test.signif = TRUE,\n", + " homogeneity.test = \"anova\",\n", + " full = TRUE\n", + ")\n", + "```\n", + "\n", + "We now plot and fetch the results\n", + "\n", + "```{r}\n", + "# Plot the results\n", + "plotReducedDim(beta_diversity_tse, \"RDA\", \n", + " colour_by = \"sample_geographic.location..country.and.or.sea.region.\")\n", + "\n", + "# Fetch significance results, including PERMANOVA and betadisper (homogeneity) results\n", + "significance_results <- attr(reducedDim(beta_diversity_tse, \"RDA\"), \"significance\")\n", + "significance_results\n", + "```\n", + "\n", + "ANOVA's p-value is not significant (p>0.05), meaning that there's no significant \n", + "difference in dispersion among groups. The PERMANOVA however showed a significant \n", + "P value (p<0.05), meaning significant differences in community composition between \n", + "the groups. \n", + "\n", + "## Detection of differentially abundant taxa\n", + "There are many approaches out there to detect differentially abundant taxa (or genes).\n", + "Here we detect microbiome community composition changes due to various environmental \n", + "factors. Describing such associations in quantitative models has multiple applications, \n", + "such as in predicting disease status based on microbiome data. \n", + "\n", + "In addition, the pipeline includes the generation of high-quality plots for visual \n", + "inspection of results ([11](#ref-wirbel2021)). There are multiple parameters that \n", + "can be adjusted for better performance in each step of the analysis. However, in \n", + "this exercise we are using default arguments with minimal tuning.\n", + "\n", + "### Association testing\n", + "\n", + "1. First we use our original counts data on our `tse` object and agglomerate \n", + "by rank to a new `genus_tse` object.\n", + "\n", + "```{r}\n", + "genus_tse <- agglomerateByRank(tse, rank = \"Genus\")\n", + "```\n", + "\n", + "2. Subset the dataset by prevalence to include only the taxa that are present in \n", + "at least 25% of the samples.\n", + "\n", + "```{r}\n", + "genus_tse <- subsetByPrevalent(genus_tse, prevalence = 0.25, include.lowest = TRUE)\n", + "```\n", + "\n", + "3. Subset our `genus_tse` samples into 2 geographic groups, and add an extra `colData` \n", + "column to accommodate this.\n", + "\n", + "```{r}\n", + "# Create a new condition vector\n", + "conditions <- ifelse(genus_tse$sample_geographic.location..country.and.or.sea.region. \n", + " == \"GAZ:United States of America\", \"USA\", \"Other\")\n", + "\n", + "# Add conditions as a new column in the colData of genus_tse\n", + "genus_tse$geo_location <- conditions\n", + "```\n", + "\n", + "4. Now we use the `ALDEx2` package to compute the differential abundance analysis.\n", + "\n", + "```{r}\n", + "# Set seed for reproducibility\n", + "set.seed(123)\n", + "\n", + "# Generate Monte Carlo samples of the Dirichlet distribution for each sample using \n", + "# the geographic location for grouping\n", + "x <- aldex.clr(assay(genus_tse, \"counts\"), genus_tse$geo_location)\n", + "\n", + "# Calculates expected values of the Welch's t-test and Wilcoxon rank\n", + "x_tt <- aldex.ttest(x, paired.test = FALSE, verbose = FALSE)\n", + "\n", + "# Determine the median clr abundance of the feature in all samples and in groups, \n", + "# the median difference between the two groups, the median variation within each \n", + "# group and the effect size - which is the median of the ration of the between \n", + "# group difference and the larger of the variance within groups\n", + "x_effect <- aldex.effect(x, CI = TRUE, verbose = FALSE)\n", + "\n", + "# Now combine all outputs into a data frame\n", + "aldex_out <- data.frame(x_tt, x_effect)\n", + "\n", + "# Now we plot a Bland-ALtman plot (left) and a dispersion plot (right)\n", + "par(mfrow = c(1, 2))\n", + "\n", + "# Set up a 1x2 plotting area\n", + "par(mfrow = c(1, 2))\n", + "\n", + "# Bland-Altman plot (left)\n", + "aldex.plot(aldex_out,\n", + " type = \"MA\",\n", + " test = \"welch\",\n", + " xlab = \"Log-ratio abundance\",\n", + " ylab = \"Difference\",\n", + " cutoff = 0.05)\n", + "\n", + "# Dispersion plot (right)\n", + "aldex.plot(aldex_out,\n", + " type = \"MW\",\n", + " test = \"welch\",\n", + " xlab = \"Dispersion\",\n", + " ylab = \"Difference\",\n", + " cutoff = 0.05)\n", + "\n", + "# Reset plotting layout\n", + "par(mfrow = c(1, 1))\n", + "\n", + "# Then do some further evaluation on the results\n", + "aldex_out %>%\n", + " rownames_to_column(var = \"Genus\") %>%\n", + " # here we choose the wilcoxon output rather than t-test output\n", + " filter(wi.eBH <= 0.05) %>%\n", + " dplyr::select(Genus, we.eBH, wi.eBH, effect, overlap) %>%\n", + " knitr::kable()\n", + "```\n", + "\n", + "## Confounder Testing \n", + "\n", + "Test the associated metadata variables for potential confounding influence. As many \n", + "biological and technical factors beyond the primary phenotype of interest can influence\n", + "microbiome composition, simple association studies may suffer confounding by other \n", + "variables, which can lead to spurious results. \n", + "\n", + "Associations are visualized either as barplot or Q-Q plot, depending on the type \n", + "of metadata. Additionally, it evaluates associations among metadata variables \n", + "using conditional entropy and associations with the label using generalized linear \n", + "models, producing a correlation heatmap and appropriate quantitative barplots, \n", + "respectively.\n", + "\n", + "Here we use the `ANCOMBC` package to test for cofounders.\n", + "\n", + "1. Compute and store the library size in `colData`.\n", + "\n", + "```{r}\n", + "genus_tse$library_size <- colSums(assay(genus_tse, \"counts\"))\n", + "```\n", + "\n", + "2. The `ancombc2_out` method performs the differential abundance analysis whilst \n", + "accounting for multiple covariates and potential confounders. \n", + "\n", + "```{r}\n", + "# Perform the analysis \n", + "ancombc2_out <- ancombc2(genus_tse,\n", + " assay.type = \"counts\",\n", + " fix_formula = \"sample_geographic.location..country.and.or.sea.region. + library_size\",\n", + " p_adj_method = \"fdr\",\n", + " lib_cut = 0,\n", + " group = \"sample_geographic.location..country.and.or.sea.region.\", \n", + " struc_zero = TRUE, \n", + " neg_lb = TRUE,\n", + " alpha = 0.05,\n", + " global = TRUE)\n", + "```\n", + "\n", + "## Generation of a predictive model: Machine learning workflow\n", + "\n", + "We'll now use the `mikropml` library to construct a machine learning meta-analysis\n", + "models on the basis of microbial markers. The steps to follow include functions \n", + "for data normalization, splitting the data into cross-validation folds, training \n", + "the model, and making predictions based on cross-validation instances and the trained \n", + "models. \n", + "\n", + "Such a model can be built on one dataset and then be applied on another, similarly \n", + "processed holdout dataset. This might be of interest when comparing data from two \n", + "different studies testing the same condition.\n", + "\n", + "We will work on our original full counts `tse` object.\n", + "\n", + "1. Normalize the data using CLR transformation.\n", + "\n", + "```{r}\n", + "tse <- transformAssay(tse, method = \"clr\", MARGIN = \"samples\", pseudocount = 1)\n", + "assay <- assay(tse, \"clr\")\n", + "```\n", + "\n", + "2. Prepare the cross-validation data frame. Cross-validation is a technique to \n", + "assess how well a machine learning model would generalize to external data by \n", + "partitioning the dataset into training and test sets. Here, we prepare the data \n", + "frame needed for cross-validation.\n", + "\n", + "```{r}\n", + "# Prepare the cross-validation data frame.\n", + "# Transpose the assay matrix to have samples as rows and features (taxa) as columns\n", + "assay <- t(assay)\n", + "\n", + "# Convert the transposed matrix to a data frame for easier manipulation and analysis\n", + "df <- as.data.frame(assay)\n", + "\n", + "# Extract the geographic location labels from the colData of the tse object\n", + "labels <- colData(tse)$sample_geographic.location..country.and.or.sea.region.\n", + "\n", + "# Simplify the labels to make them valid R variable names\n", + "labels <- gsub(\" \", \"_\", labels)\n", + "labels <- gsub(\"[:.]\", \"\", labels)\n", + "\n", + "# Convert the geographic location labels to a factor (categorical variable)\n", + "labels <- as.factor(labels)\n", + "\n", + "# Add the geographic location labels as a new column in the data frame\n", + "df$geo_location <- labels\n", + "```\n", + "\n", + "3. Train the model using `mikropml`. Model training involves selecting and applying \n", + "a machine learning algorithm to the data. Here, we use the random forest algorithm \n", + "(rf) to train our model. We do 10 folds for our cross-validation, and repeat the \n", + "process 5 times.\n", + "\n", + "```{r}\n", + "results <- run_ml(df, \"rf\", outcome_colname = \"geo_location\", kfold = 10, \n", + " cv_times = 5, training_frac = 0.8)\n", + "```\n", + "\n", + "4. Model evaluation involves assessing the performance of the trained model. \n", + "Here, we use a confusion matrix to evaluate the model's predictions. \n", + "\n", + "A confusion matrix provides a summary of prediction results on a classification \n", + "problem, showing the number of correct and incorrect predictions made by the model \n", + "compared to the actual outcomes.\n", + "\n", + "```{r}\n", + "conf_matrix <- confusionMatrix(data = results$trained_model$finalModel$predicted, \n", + " reference = results$trained_model$finalModel$y)\n", + "```\n", + "\n", + "5. Evaluate model using `MLEval`. This generates detailed evaluation metrics.\n", + "\n", + "```{r}\n", + "res <- evalm(results$trained_model, showplots = FALSE)\n", + "```\n", + "\n", + "6. Plot ROC and precision-recall curves side-by-side. Visualization is an important \n", + "step to understand the performance of the model. Here, we plot the ROC and precision-recall \n", + "curves.\n", + "\n", + "```{r}\n", + "roc_pr_plot <- res$roc + res$proc + plot_layout(guides = \"collect\") & theme(legend.position = 'bottom')\n", + "```\n", + "\n", + "# References\n", + "\n", + "1. McKnight, DT., Huerlimann, R., Bower, DS. et al. \n", + "Methods for Normalizing Microbiome Data: An Ecological Perspective. Methods in \n", + "Ecology and Evolution / British Ecological Society 10(3):389–400 (2019) \n", + "[DOI:10.1111/2041-210X.13115](https://doi.org/10.1111/2041-210X.13115)\n", + "\n", + "2. McMurdie, PJ., Holmes, S. Waste Not, Want Not: \n", + "Why Rarefying Microbiome Data Is Inadmissible. PLoS Comput Biol 10(4):e1003531 (2014) \n", + "[DOI:10.1371/journal.pcbi.1003531](https://doi.org/10.1371/journal.pcbi.1003531)\n", + "\n", + "3. Schloss, Patrick D. ‘Waste Not, Want Not: \n", + "Revisiting the Analysis That Called into Question the Practice of Rarefaction’. \n", + "mSphere 9, no. 1 (2024): e00355-23 [DOI:10.1128/msphere.00355-23](https://journals.asm.org/doi/10.1128/msphere.00355-23)\n", + "\n", + "4. Schloss PD. 2024. Rarefaction is currently the \n", + "best approach to control for uneven sequencing effort in amplicon sequence analyses. \n", + "mSphere 9 [DOI:10.1128/msphere.00354-23](https://journals.asm.org/doi/10.1128/msphere.00354-23)\n", + "\n", + "5. Lynch, M., Neufeld, J. Ecology and exploration of \n", + "the rare biosphere. Nat Rev Microbiol 13:217–229 (2015) [DOI:10.1038/nrmicro3400](https://doi.org/10.1038/nrmicro3400)\n", + "\n", + "6. Willis, A. Rarefaction, Alpha Diversity, and Statistics. \n", + "Front. Microbiol. Sec. Terrestrial Microbiology 10:2407 (2019) [DOI:10.3389/fmicb.2019.02407](https://doi.org/10.3389/fmicb.2019.02407)\n", + "\n", + "7. Kim, BR., Shin, J., Guevarra, R. et al. Deciphering \n", + "Diversity Indices for a Better Understanding of Microbial Communities. J Microbiol \n", + "Biotechnol 28;27(12):2089-2093 (2017) [DOI:10.4014/jmb.1709.09027](https://doi.org/10.4014/jmb.1709.09027)\n", + "\n", + "8. Wagner, BD., Grunwald, GK., Zerbe, GO. et al. On \n", + "the Use of Diversity Measures in Longitudinal Sequencing Studies of Microbial \n", + "Communities. Front. Microbiol. 9:1037 (2018) [DOI:10.3389/fmicb.2018.01037](https://doi.org/10.3389/fmicb.2018.01037)\n", + "\n", + "9. Pereira, M., Wallroth, M., Jonsson, V. et al. \n", + "Comparison of normalization methods for the analysis of metagenomic gene abundance \n", + "data. BMC Genomics 19,274 (2018) [DOI:10.1186/s12864-018-4637-6](https://doi.org/10.1186/s12864-018-4637-6)\n", + "\n", + "10. McMurdie, P., Holmes, S. Tutorial of 2018: The \n", + "distance function in phyloseq. Available at website\n", + "\n", + "11. Wirbel, J., Zych, K., Essex, M. et al. Microbiome \n", + "meta-analysis and cross-disease comparison enabled by the SIAMCAT machine learning \n", + "toolbox. Genome Biol 22:93 (2021) [DOI:10.1186/s13059-021-02306-1](https://doi.org/10.1186/s13059-021-02306-1)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/src/notebooks/R Mia Examples/Fetch-Analyses-metadata-for-a-Study.ipynb b/src/notebooks/R Mia Examples/Fetch-Analyses-metadata-for-a-Study.ipynb new file mode 100644 index 0000000..338271b --- /dev/null +++ b/src/notebooks/R Mia Examples/Fetch-Analyses-metadata-for-a-Study.ipynb @@ -0,0 +1,143 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "title: \"Fetch analyses metadata for a study\"\n", + "author:\n", + " - name: Noah de Gunst\n", + " affiliation:\n", + " - id: mia\n", + " name: Department of Computing, University of Turku, Finland\n", + " - name: Sam Hillman\n", + " affiliation:\n", + " - id: mia\n", + " name: Department of Computing, University of Turku, Finland\n", + "categories: [R]\n", + "execute: \n", + " eval: true\n", + "---\n", + "\n", + "::: {style=\"max-width:1200px\"}\n", + "![](./../_resources/mgnify_logo.png)\n", + "\n", + "# Fetch a study using MGnifyR; download the metadata for all of its analyses\n", + "\n", + "The [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1) returns data and \n", + "relationships as JSON. [MGnifyR](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html) \n", + "is a package to help you read MGnify data into your R analyses.\n", + "\n", + "You can find all of the other \"API endpoints\" using the [Browsable API interface in your web browser](https://www.ebi.ac.uk/metagenomics/api/v1).\n", + "\n", + "This is an interactive code notebook (a Jupyter Notebook). To run this code, click \n", + "into each cell and press the ▶ button in the top toolbar, or press `shift+enter`.\n", + "\n", + "------------------------------------------------------------------------\n", + ":::\n", + "\n", + "#### Setting the access code\n", + "First, we need to specify the accession number of the study we're working with. \n", + "This can be done by setting the `mgnify_study_accession` variable. The accession \n", + "number uniquely identifies the study in the MGnify database.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "\n", + "source(\"./utils/variable_utils.r\")\n", + "\n", + "mgnify_study_accession <- get_variable_from_link_or_input('MGYS', 'Study Accession', \n", + " 'MGYS00005116')\n", + "\n", + "# You can also just directly set the accession variable in code, like this:\n", + "# mgnify_study_accession <- \"MGYS00005292\"\n", + "```\n", + "\n", + "#### Constructing a MgnifyClient object to access the database\n", + "To interact with the MGnify database, we need to create an MgnifyClient object. \n", + "This object allows us to fetch data from MGnify, and we can configure it to use \n", + "a cache for efficiency. \n", + "\n", + "```{r}\n", + "#| output: false\n", + "# Importing the libraries\n", + "library(vegan)\n", + "library(ggplot2)\n", + "library(mia)\n", + "library(MGnifyR)\n", + "\n", + "# Check if the cache directory exists, if not, create it\n", + "if (!dir.exists(\"./.mgnify_cache\")) {\n", + " dir.create(\"./.mgnify_cache\", recursive = TRUE)\n", + "}\n", + "\n", + "# Create the MgnifyClient object with caching enabled\n", + "mg <- MgnifyClient(usecache = TRUE, cache_dir = \"./.mgnify_cache\")\n", + "```\n", + "\n", + "#### Displaying the help file\n", + "\n", + "```{r}\n", + "#| output: false\n", + "library(IRdisplay)\n", + "display_markdown(file = '../_resources/mgnifyr_help.md')\n", + "```\n", + "\n", + "## Fetch a list of the Analyses for the Study\n", + "Using the MgnifyClient object, we can search for all analyses associated with the \n", + "study accession number we set earlier. This will return a list of analysis accession \n", + "numbers.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "analyses_accessions <- searchAnalysis(mg, \"studies\", mgnify_study_accession)\n", + "analyses_accessions\n", + "```\n", + "\n", + "## Download metadata for the first 10 Analyses\n", + "...and put it into a dataframe.\n", + "\n", + "```{r}\n", + "#| output: false\n", + "analyses_metadata_df <- getMetadata(mg, head(analyses_accessions, 10))\n", + "```\n", + "\n", + "## Display metadata\n", + "The table could be big, so let's look at a sample of it (`head`).\n", + "\n", + "```{r}\n", + "#| output: false\n", + "t(head(analyses_metadata_df))\n", + "```\n", + "\n", + "## Download the data to a multi-assay data object\n", + "\n", + "> [mia](https://microbiome.github.io/mia/) is a Bioconductor package designed to \n", + "import, store and analyze microbiome data using an object called a `TreeSummarizedExperiment.` \n", + "This is a tailored data container optimized for microbiome data analysis.Being \n", + "built on the `SummarizedExperiment` class, miaverse seamlessly integrates into the \n", + "extensive `SummarizedExperiment` ecosystem. In this example we download the MGnifyR \n", + "data to an MAE, which contains multiple `TreeSummarizedExperiment` objects.\n", + "\n", + "\n", + "```{r}\n", + "#| output: false\n", + "mae <- getResult(mg, accession = analyses_accessions)\n", + "```\n", + "\n", + "You use `MGnifyR` features further, for example to download data. Check the Cheat \n", + "Sheet at the top for more." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/src/notebooks/R Mia Examples/_resources/mgnify_banner.png b/src/notebooks/R Mia Examples/_resources/mgnify_banner.png new file mode 100644 index 0000000..0c504d2 Binary files /dev/null and b/src/notebooks/R Mia Examples/_resources/mgnify_banner.png differ diff --git a/src/notebooks/R Mia Examples/_resources/mgnify_logo.png b/src/notebooks/R Mia Examples/_resources/mgnify_logo.png new file mode 100644 index 0000000..d4f4fd5 Binary files /dev/null and b/src/notebooks/R Mia Examples/_resources/mgnify_logo.png differ diff --git a/src/notebooks/R Mia Examples/_resources/mgnifyr_help.md b/src/notebooks/R Mia Examples/_resources/mgnifyr_help.md new file mode 100644 index 0000000..668fa08 --- /dev/null +++ b/src/notebooks/R Mia Examples/_resources/mgnifyr_help.md @@ -0,0 +1,12 @@ +# Help with MGnifyR + +MGnifyR is an R package that provides a convenient way for R users to access data from [the MGnify API](https://www.ebi.ac.uk/metagenomics/api/). + +Detailed help for each function is available in R using the standard `?function_name` command. + +A vignette is available containing a reasonably verbose overview of the main functionality. +This can be read either within R with the `vignette("MGnifyR")` command, or [on the bioconductor vignette website](https://www.bioconductor.org/packages/release/bioc/vignettes/MGnifyR/inst/doc/MGnifyR.html) + +## MGnifyR Command cheat sheet + +For a full list of key MGnifyR functions, please look at the [MGnifyR website](https://ebi-metagenomics.github.io/MGnifyR/reference/index.html). diff --git a/src/notebooks/R Mia Examples/utils/kegg_pathways_utils.r b/src/notebooks/R Mia Examples/utils/kegg_pathways_utils.r new file mode 100644 index 0000000..c2f803e --- /dev/null +++ b/src/notebooks/R Mia Examples/utils/kegg_pathways_utils.r @@ -0,0 +1,60 @@ +# Retrieve name and URL for a specific pathway in the KEGG database +get_pathway_info <- function(pathway) { + pathway <- paste("map", pathway, sep = "") + pathway_name <- keggList(pathway)[[1]] + pathway_url <- paste("https://www.kegg.jp/pathway/", pathway, sep = "") + return(list(pathway_name = pathway_name, pathway_url = pathway_url)) +} + + +# Function to prompt users for pathway selection and return custom pathway IDs +PathwaysSelection <- function() { + display_markdown("#### Pathways Selection :\n\n +- For the most general & most complete pathways, input 'G'\n\n +- Press Enter to generate the most complete pathways\n\n +- To add custom pathways, input pathway numbers (ex: 00053,01220)") + + flush.console() + CUSTOM_PATHWAY_IDS <- get_variable_from_link_or_input('CUSTOM_PATHWAY_IDS', name = 'Pathways Accession', default = '') + + if (CUSTOM_PATHWAY_IDS == "") { + CUSTOM_PATHWAY_IDS <- list() + } else if (CUSTOM_PATHWAY_IDS == "G") { + CUSTOM_PATHWAY_IDS <- list("00010", "00020", "00030", "00061", "01232","00240", "00190") + } else { + CUSTOM_PATHWAY_IDS <- strsplit(CUSTOM_PATHWAY_IDS, ",")[[1]] + } + + message(if (length(CUSTOM_PATHWAY_IDS) > 0) { + paste("\nUsing", CUSTOM_PATHWAY_IDS, " - ", sapply(CUSTOM_PATHWAY_IDS, function(id) paste(get_pathway_info(id)[1]," : ",get_pathway_info(id)[2])), "as a Custom Pathway") + } else { + "\nUsing NONE as a Custom Pathway" + }) + return(CUSTOM_PATHWAY_IDS) +} + + +# Clearing the current working directory and displaying generated figures from `pathway_plots/` directory +generatePathwayPlots <- function() { +# Clearing the current working directory + if (!dir.exists("pathway_plots")) { + dir.create("pathway_plots") + } + + file.copy(from = list.files(pattern = "./*pathview.png"), to = "./pathway_plots/", overwrite = TRUE) + + png_files <- list.files(path = ".", pattern = "*.png") + xml_files <- list.files(path = ".", pattern = "*.xml") + files <- c(png_files, xml_files) + output <- capture.output({ + unlink(files) + }) + +# Accessing the png files and displaying it + images <- list.files("pathway_plots", full.names = TRUE) + + for (pathway in images) { + display_markdown(get_pathway_info(gsub("[^0-9]", "", basename(pathway)))$pathway_name) + display_png(file = pathway) + } +} \ No newline at end of file diff --git a/src/notebooks/R Mia Examples/utils/variable_utils.r b/src/notebooks/R Mia Examples/utils/variable_utils.r new file mode 100644 index 0000000..f6c2cc9 --- /dev/null +++ b/src/notebooks/R Mia Examples/utils/variable_utils.r @@ -0,0 +1,16 @@ +library(glue) + +get_variable_from_link_or_input <- function(variable, name = 'accession', default = NA) { + # Get a variable value, either from an ENV VAR that would have been set by the jlab_query_params extension, or through direct user input. + var <- Sys.getenv(variable, unset = NA) + if (!is.na(var)) { + print(glue('Using {name} = {var} from the link you followed.')) + } else { + determiner <- ifelse(grepl(tolower(substr(name, 0, 1)), 'aeiou'), 'an', 'a') + var <- readline(prompt = glue("Type {determiner} {name} [default: {default}]")) + } + var <- ifelse(is.na(var) || var == '', default, var) + print(glue('Using "{var}" as {name}')) + var +} +