diff --git a/.gitignore b/.gitignore
index 1ad710f..aaed6f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,10 +16,16 @@ src/docs/*.html
src/*.html
src/notebooks/**/*.html
src/*-listing.json
+renv/
+renv.lock
+.Rhistory
*.parquet
!**/example-data/**/*.parquet
*.sig
ko*.pathview.png
src/notebooks/R Examples/*.tsv
-src/notebooks/R Examples/*.txt
\ No newline at end of file
+src/notebooks/R Examples/*.txt
+.Rproj.user
+.RData
+.Rprofile
diff --git a/src/notebooks/R Mia Examples/Comparative-Metagenomics.ipynb b/src/notebooks/R Mia Examples/Comparative-Metagenomics.ipynb
new file mode 100644
index 0000000..f299d81
--- /dev/null
+++ b/src/notebooks/R Mia Examples/Comparative-Metagenomics.ipynb
@@ -0,0 +1,902 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "title: \"Comparative Metagenomics\"\n",
+ "author:\n",
+ " - name: Sam Hillman\n",
+ " affiliation:\n",
+ " - id: mia\n",
+ " name: Department of Computing, University of Turku, Finland\n",
+ "toc: true\n",
+ "number-sections: true\n",
+ "categories: [R]\n",
+ "execute: \n",
+ " eval: true\n",
+ "---\n",
+ "\n",
+ "::: {style=\"max-width:1200px\"}\n",
+ "![](./../_resources/mgnify_logo.png)\n",
+ "\n",
+ "# Comparative metagenomics\n",
+ "# Normalization methods, alpha & beta diversity, and differentially abundant taxa\n",
+ "\n",
+ "In this notebook we aim to demonstrate how the MGnifyR tool can be used to fetch \n",
+ "data and metadata of a MGnify metagenomic analysis \n",
+ "\n",
+ "Then we show how to generate diversity metrics for comparative metagenomics using\n",
+ "taxonomic profiles. \n",
+ "\n",
+ "Finally, we use the `mikropml` package to detect differentially abundant taxa and\n",
+ "to build an ML classification model. \n",
+ "\n",
+ "It's important to note, we're using methods and protocols that align with the \n",
+ "[mia](https://microbiome.github.io/mia/) package, which can be read about further \n",
+ "in the [OMA](https://microbiome.github.io/OMA/docs/devel/) guides.\n",
+ "\n",
+ "[MGnifyR](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html) \n",
+ "is a library that provides a set of tools for easily accessing and processing \n",
+ "MGnify data in R, making queries to MGnify databases through the [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/). \n",
+ "\n",
+ "The benefit of MGnifyR is that data can either be fetched in tsv format or be directly \n",
+ "combined in a [TreeSummarizedExperiment](https://www.bioconductor.org/packages/release/bioc/vignettes/TreeSummarizedExperiment/inst/doc/Introduction_to_treeSummarizedExperiment.html)\n",
+ "object to run an analysis in a custom workflow.\n",
+ "\n",
+ "`TreeSummarizedExperiment` is supported by the [mia bioconductor package](https://microbiome.github.io/mia/), \n",
+ "which is dedicated to microbiome analysis.\n",
+ "\n",
+ "This is an interactive code notebook (a Jupyter Notebook). To run this code, \n",
+ "click into each cell and press the ▶ button in the top toolbar, or press `shift+enter`.\n",
+ "\n",
+ "------------------------------------------------------------------------\n",
+ "\n",
+ "```{r}\n",
+ "# Loading libraries\n",
+ "suppressMessages({\n",
+ " library(ggplot2)\n",
+ " library(MGnifyR)\n",
+ " library(tidyverse)\n",
+ " library(vegan)\n",
+ " library(mia)\n",
+ " library(miaViz)\n",
+ " library(biomformat)\n",
+ " library(scater)\n",
+ " library(metagenomeSeq)\n",
+ " library(reshape2)\n",
+ " library(viridis)\n",
+ " library(patchwork)\n",
+ " library(IRdisplay)\n",
+ " library(ALDEx2)\n",
+ " library(ANCOMBC)\n",
+ " library(mikropml)\n",
+ " library(MLeval)\n",
+ " library(caret)\n",
+ " library(scales)\n",
+ "})\n",
+ "\n",
+ "display_markdown(file = '../_resources/mgnifyr_help.md')\n",
+ "```\n",
+ "\n",
+ "# Fetching data and preprocessing\n",
+ "In this example we are going to fetch MGnify analysis results and metadata for a \n",
+ "dataset on the human gut microbiome viewed across age and geography - \n",
+ "([MGYS00005154](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005154)).\n",
+ "\n",
+ "## Setting the study accession\n",
+ "1. First, we need to specify the accession number of the study we're working with. \n",
+ "This can be done by setting the `mgnify_study_accession` variable. The accession \n",
+ "number uniquely identifies the study in the MGnify database.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "\n",
+ "source(\"./utils/variable_utils.r\")\n",
+ "\n",
+ "mgnify_study_accession <- get_variable_from_link_or_input('MGYS', 'Study Accession', \n",
+ " 'MGYS00005154')\n",
+ "\n",
+ "# You can also just directly set the accession variable in code, like this:\n",
+ "# mgnify_study_accession <- \"MGYS00005292\"\n",
+ "```\n",
+ "\n",
+ "2. To interact with the MGnify database, we need to create an MgnifyClient object. \n",
+ "This object allows us to fetch data from MGnify, and we can configure it to use \n",
+ "a cache for efficiency. \n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "# Check if the cache directory exists, if not, create it\n",
+ "if (!dir.exists(\"./.mgnify_cache\")) {\n",
+ " dir.create(\"./.mgnify_cache\", recursive = TRUE)\n",
+ "}\n",
+ "\n",
+ "# Create the MgnifyClient object with caching enabled\n",
+ "mg <- MgnifyClient(usecache = TRUE, cache_dir = \"./.mgnify_cache\")\n",
+ "```\n",
+ "\n",
+ "3. Using the MgnifyClient object, we can search for all analyses associated with \n",
+ "the study accession number we set earlier. This will return a list of analysis \n",
+ "accession numbers.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "analyses_accessions <- searchAnalysis(mg, \"studies\", mgnify_study_accession)\n",
+ "analyses_accessions\n",
+ "```\n",
+ "\n",
+ "## Download metadata\n",
+ "1. First, download the metadata for the first 20 Analyses and put it into a dataframe.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "analyses_metadata_df <- getMetadata(mg, head(analyses_accessions, 30))\n",
+ "```\n",
+ "\n",
+ "2. The table could be big, so let's look at a sample of it (`head`).\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "t(head(analyses_metadata_df))\n",
+ "```\n",
+ "\n",
+ "## Convert to `TreeSummarizedExperiment` object.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "tse <- getResult(mg, analyses_metadata_df$analysis_accession)\n",
+ "```\n",
+ "\n",
+ "# Normalization, alpha diversity indices and taxonomic profiles visualization\n",
+ "\n",
+ "Let's Show some stats on sequencing depth across samples.\n",
+ "\n",
+ "```{r}\n",
+ "# Calculate and print max difference\n",
+ "max_difference <- max(tse$sum) / min(tse$sum)\n",
+ "sprintf(\"The max difference in sequencing depth is %s\", max_difference)\n",
+ "```\n",
+ "\n",
+ "An approximate 4.5-fold difference in the library sizes means that we will need \n",
+ "to apply a normalization method before continuing with the analysis. The most \n",
+ "common normalization methods used in microbiome count data are proportions and rarefaction. \n",
+ "\n",
+ "However, other methods originally developed to normalize RNA-seq counts have been \n",
+ "adapted to differential-abundance analysis in microbiome data. A discussion about \n",
+ "how to choose the right normalization method is out of the scope of this material, \n",
+ "but the topic has been covered in multiple forums and scientific publications. \n",
+ "\n",
+ "Depending on the downstream analysis we intend to perform, different methods might \n",
+ "be appropriate. For instance, to compare groups of samples at community-level \n",
+ "through beta-diversity, “…proportions and rarefying produced more accurate comparisons \n",
+ "among communities and are the only methods that fully normalized read depths across \n",
+ "samples.\n",
+ "\n",
+ "Additionally, upper quartile, cumulative sum scaling (CSS), edgeR-TMM, and DESeq-VS \n",
+ "often masked differences among communities when common OTUs differed, and they \n",
+ "produced false positives when rare OTUs differed” ([1](#ref-mcknight2019)). On \n",
+ "the other hand, for detection of differentially abundant species, “both proportions \n",
+ "and rarefied counts result in a high rate of false positives in tests for species \n",
+ "that are differentially abundant across sample classes” ([2](#ref-mcmurdie2014)).\n",
+ "\n",
+ "However, rarefaction can utilize information in the full read count data, and \n",
+ "the debate on the pros and cons of rarefaction is ongoing ([3](#ref-schloss2024a), \n",
+ "[4](#ref-schloss2024b)).\n",
+ "\n",
+ "In the following examples we will show two popular ways of normalization: \n",
+ "relative abundance transformation (transforming features into zero mean, unit variance - \n",
+ "also called relative abundance transformation) and rarefaction.\n",
+ "\n",
+ "## Normalization by total sum scaling (TSS, relative abundance or proportions)\n",
+ "The simplest way to normalize the differences in sample size is to do a relative \n",
+ "abundance transformation on the OTU counts table by dividing by the number of \n",
+ "total reads of each sample. This type of normalization is also referred to as \n",
+ "relative abundance or proportions. \n",
+ "\n",
+ "We use this normalization to compare taxonomic profiles, while alpha diversity \n",
+ "indices are computed on the non-normalized matrix. The reason to do so is that \n",
+ "we need a matrix of integer numbers as input.\n",
+ "\n",
+ "1. Apply the relative abundance transformation to raw counts matrix.\n",
+ "\n",
+ "```{r}\n",
+ "tse <- transformAssay(tse, name = \"relabundance\", method = \"relabundance\")\n",
+ "```\n",
+ "\n",
+ "2. Agglomerate taxonomy at Class rank and save to the `glom_tse_relab` object \n",
+ "within `altExp`. Then remove all assays other than `relabundance` for simplicity.\n",
+ "\n",
+ "```{r}\n",
+ "# Agglomerate at class rank\n",
+ "altExp(tse, \"glom_tse_relab\") <- agglomerateByRank(tse, rank = \"Class\")\n",
+ "\n",
+ "# Remove assays other than relabundance\n",
+ "assays(altExp(tse, \"glom_tse_relab\")) <- assays(altExp(tse, \"glom_tse_relab\"))[\"relabundance\"]\n",
+ "```\n",
+ "\n",
+ "3. Get the top 15 features of the `relabundance` assay and subset the `altExp` \n",
+ "`relabundance` assay by those top 15 features.\n",
+ "\n",
+ "```{r}\n",
+ "# Take the top 15 features based on their means\n",
+ "top15_relabundance <- getTop(altExp(tse, \"glom_tse_relab\"), method = \"mean\", \n",
+ " assay.type = \"relabundance\", top = 15L)\n",
+ "\n",
+ "# Subset relabundance tse by top features\n",
+ "altExp(tse, \"glom_tse_relab\") <- altExp(tse, \"glom_tse_relab\")[top15_tss, ]\n",
+ "```\n",
+ "\n",
+ "## Normalization by subsampling (rarefaction)\n",
+ "Rarefaction is an alternative to doing a relative abundance transformation to \n",
+ "obtain an adjusted OTUs count matrix. The method is based on a process of subsampling \n",
+ "to the smallest library size in the data set. \n",
+ "\n",
+ "The algorithm randomly removes reads until the samples reach the same library size. \n",
+ "Despite the apparent disadvantage of discarding information from the larger samples, \n",
+ "rarefaction is quite popular in microbial ecology. \n",
+ "\n",
+ "The first step is to find the smallest sample size. We can use the number of \n",
+ "observed OTUs in the original matrix to do so.\n",
+ "\n",
+ "1. Find the smallest sample size.\n",
+ "\n",
+ "```{r}\n",
+ "min_size <- min(colSums2(assay(tse, \"counts\")))\n",
+ "```\n",
+ "\n",
+ "2. Rarefying to the smallest sample.\n",
+ "\n",
+ "```{r}\n",
+ "tse <- rarefyAssay(tse, assay.type = \"counts\", size = min_size, name = \"rarefy\")\n",
+ "```\n",
+ "\n",
+ "3. Agglomerate taxonomy at Class rank and save to the `glom_tse` object within `altExp`. \n",
+ "Then remove all assays other than `rarefy` for simplicity.\n",
+ "\n",
+ "```{r}\n",
+ "# Agglomerate at class rank\n",
+ "altExp(tse, \"glom_tse_rarefy\") <- agglomerateByRank(tse, rank = \"Class\")\n",
+ "\n",
+ "# Remove assays other than rarefy\n",
+ "assays(altExp(tse, \"glom_tse_rarefy\")) <- assays(altExp(tse, \"glom_tse_rarefy\"))[\"rarefy\"]\n",
+ "```\n",
+ "\n",
+ "\n",
+ "4. Get the top 15 features of the `rarefy` assay and subset the `altExp`\n",
+ "`rarefy` assay by those top 15 features. \n",
+ "\n",
+ "```{r}\n",
+ "# Get the top features\n",
+ "top15_rarefy <- getTop(altExp(tse, \"glom_tse_rarefy\"), top = 15L, \n",
+ " assay.type = \"rarefy\", method = \"mean\")\n",
+ "\n",
+ "# Subset original rarefy tse by top features\n",
+ "altExp(tse, \"glom_tse_rarefy\") <- altExp(tse, \"glom_tse_rarefy\")[top15_rarefy, ]\n",
+ "```\n",
+ "\n",
+ "## OTU Filtering\n",
+ "\n",
+ "1. First, identify the total counts per sample and add them to the colData of our \n",
+ "`tse` object.\n",
+ "\n",
+ "```{r}\n",
+ "tse <- addPerCellQC(tse)\n",
+ "```\n",
+ "\n",
+ "2. We then plot a histogram to identify outliers, so we can remove samples with \n",
+ "low coverage.\n",
+ "\n",
+ "```{r}\n",
+ "scatter_plot <- plotColData(tse, y = \"sum\") +\n",
+ " scale_y_log10(\n",
+ " breaks = trans_breaks(\"log10\", function(x) 10^x, n = 8),\n",
+ " labels = trans_format(\"log10\", math_format(10^.x))\n",
+ " ) +\n",
+ " labs(x = \"Samples\", y = \"Library size (log scale)\") \n",
+ "```\n",
+ "\n",
+ "3. We can see that samples with number of reads above 10^6.2 seem to be outliers. \n",
+ "Let’s filter out the outliers and plot a new histogram.\n",
+ "\n",
+ "At the same time, we can also filter out singletons. Singletons are OTUs of size \n",
+ "one, meaning that only one read was assigned to that OTU. These very low-abundance \n",
+ "OTUs could be biologically real (belonging to the rare biosphere ([5](#ref-lynch2015))), \n",
+ "or they could be false positives due to sequencing artefacts. Singletons observed \n",
+ "in only one sample are more likely to be artefacts, and it is good practice to \n",
+ "remove them from the OTUs counts table to avoid artificially over-estimating the \n",
+ "OTUs richness. You can find more discussion about this in [Robert Edgar’s blog](https://drive5.com/usearch/manual/singletons.html).\n",
+ "\n",
+ "```{r}\n",
+ "# Subset the tse object to keep only samples with library sizes > 10^6.2 \n",
+ "# and non-singleton taxa in the filtered data\n",
+ "tse <- tse[rowSums(assay(tse, \"counts\")) > 1, tse$sum < 10^6.2]\n",
+ "\n",
+ "# Now we redo the scatter plot\n",
+ "filtered_scatter_plot <- plotColData(tse, y = \"sum\") +\n",
+ " scale_y_log10(\n",
+ " breaks = trans_breaks(\"log10\", function(x) 10^x, n = 8),\n",
+ " labels = trans_format(\"log10\", math_format(10^.x))\n",
+ " ) +\n",
+ " labs(x = \"Samples\", y = \"Library size (log scale)\") \n",
+ "```\n",
+ "\n",
+ "## Computing alpha diversity indices\n",
+ "Alpha diversity is a measure of species diversity in a particular area or an ecosystem. \n",
+ "In terms of microbial ecology, analyzing the sample diversity of sequencing data \n",
+ "is commonly the first step in assessing differences between microbial environments. \n",
+ "Some of the most popular alpha diversity indices reported in microbial community \n",
+ "analyses are Chao, abundance-based coverage estimators (ACE), Simpson, and Shannon-Weaver.\n",
+ "\n",
+ "Chao1 and ACE are nonparametric estimators that describe diversity as the max \n",
+ "number of expected species in the sample (species richness). They consider the \n",
+ "proportion of species that have been observed before (“recaptured”) to those that \n",
+ "are observed only once. Both Chao1 and ACE underestimate true richness at low sample sizes.\n",
+ "\n",
+ "On the other hand, Simpson and Shannon-Weaver, consider relative abundances and \n",
+ "depends on both, species richness and the evenness (or equitableness), with which \n",
+ "individuals are distributed among the different species. However, both metrics have \n",
+ "specific biases. The Shannon-Weaver index places a greater weight on species richness, \n",
+ "whereas the Simpson index considers evenness more than richness in its measurement.\n",
+ "\n",
+ "More discussion and examples illustrating how sample size and normalization methods \n",
+ "can affect alpha diversity metrics in references [6](#ref-willis2019) and [7](#ref-kim2017).\n",
+ "\n",
+ "In this example, we're going to calculate the diversity measures for our original \n",
+ "counts data, and for our rarefied original counts data.\n",
+ "\n",
+ "1. Now, let's use the `addAlpha` method to compute the alpha diversity indices \n",
+ "on original raw counts data, and we use the `niter` argument to rarefy our original \n",
+ "counts data to calculate the shannon diversity for rarefied data. We also add a \n",
+ "`SampleID` colData column to `tse` to help with out plotting too. \n",
+ "\n",
+ "For this example we'll use the `shannon_diversity` index to show alpha diversity, \n",
+ "though [see here](https://microbiome.github.io/mia/reference/addAlpha.html) for \n",
+ "more index options.\n",
+ "\n",
+ "```{r}\n",
+ "# Add SampleID to colData\n",
+ "tse$SampleID <- colnames(tse)\n",
+ "\n",
+ "# Compute alpha diversity on normal counts data\n",
+ "tse <- addAlpha(tse, index = \"shannon_diversity\", name = \"shannon_diversity_normal\")\n",
+ "\n",
+ "# Compute alpha diversity on rarefied data\n",
+ "tse <- addAlpha(tse, index = \"shannon_diversity\", name = \"shannon_diversity_rarefy\", \n",
+ " niter = 10)\n",
+ "```\n",
+ "\n",
+ "2. Plot alpha diversity with `plotColData`, generating a plot for our shannon \n",
+ "diversity calculation on both our original counts data and rarefied data. We save \n",
+ "both plots to `plots`.\n",
+ "\n",
+ "```{r}\n",
+ "# Use lapply to generate plots\n",
+ "plots <- lapply(c(\"shannon_diversity_normal\", \"shannon_diversity_rarefy\"), function(y) {\n",
+ " label <- ifelse(y == \"shannon_diversity_normal\", \"normal\", \"rarefy\")\n",
+ " plotColData(tse, x = \"SampleID\", y = y, colour_by = \"SampleID\") +\n",
+ " labs(y = paste(\"Shannon Diversity Index (\", label, \")\", sep = \"\")) +\n",
+ " theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size = 6))\n",
+ "})\n",
+ "```\n",
+ "\n",
+ "3. Combine all plots using patchwork.\n",
+ "\n",
+ "```{r}\n",
+ "# Combine all plots using patchwork\n",
+ "wrapped_plots <- wrap_plots(plots) +\n",
+ " plot_layout(guides = \"collect\") + \n",
+ " plot_annotation(title = \"Shannon Diversity Calculations\",\n",
+ " theme = theme(plot.title = element_text(size = 12, hjust = 0.5)))\n",
+ "```\n",
+ "\n",
+ "## Community profile visualization\n",
+ "Here we visualise the taxonomic profiles in barplots at class ranks for our top \n",
+ "tss normalized taxa and rarefied taxa.\n",
+ "\n",
+ "1. Let's store our `glom_tse_relab` and `glom_tse_rarefy` objects in \n",
+ "variables to make referencing them easier.\n",
+ "\n",
+ "```{r}\n",
+ "glom_tse_relab <- altExp(tse, \"glom_tse_relab\")\n",
+ "glom_tse_rarefy <- altExp(tse, \"glom_tse_rarefy\")\n",
+ "```\n",
+ "\n",
+ "\n",
+ "2. Lets change any unknown taxonomic rowData information to \"Unknown\" from \"NA\" \n",
+ "in our `tse` objects.\n",
+ "\n",
+ "```{r}\n",
+ "rowData(glom_tse_relab)$Class <- ifelse(is.na(rowData(glom_tse_relab)$Class), \"Unknown\", \n",
+ " rowData(glom_tse_relab)$Class)\n",
+ "rowData(glom_tse_rarefy)$Class <- ifelse(is.na(rowData(glom_tse_rarefy)$Class), \"Unknown\", \n",
+ " rowData(glom_tse_rarefy)$Class)\n",
+ "```\n",
+ "\n",
+ "3. Create the plots with the `plotAbundance` `mia` method. We use our agglomerated \n",
+ "`relabundance` and `rarefy` assays from our `tse` objects to do this.\n",
+ "\n",
+ "```{r}\n",
+ "# Plotting our tss data\n",
+ "relabundance_plot <- plotAbundance(glom_tse_relab,\n",
+ " assay.type = \"relabundance\",\n",
+ " rank = \"Class\",\n",
+ " order_rank_by = \"abund\")\n",
+ "\n",
+ "# Plotting our rarefied data\n",
+ "rarefy_plot <- plotAbundance(glom_tse_rarefy,\n",
+ " assay.type = \"rarefy\",\n",
+ " rank = \"Class\",\n",
+ " order_rank_by = \"abund\")\n",
+ "```\n",
+ "\n",
+ "By naked eye, we can see that the profiles change. Let’s take a deeper look at \n",
+ "some interesting Classes to check how the abundance change depending on the \n",
+ "normalization method.\n",
+ "\n",
+ "1. Define our subset classes\n",
+ "\n",
+ "```{r}\n",
+ "subset_classes <- c(\"Betaproteobacteria\", \n",
+ " \"Bacteroidia\", \n",
+ " \"Actinobacteria\")\n",
+ "```\n",
+ "\n",
+ "\n",
+ "2. Subset our `tse` objects.\n",
+ "\n",
+ "```{r}\n",
+ "glom_tse_relab <- glom_tse_relab[rowData(glom_tse_relab)$Class %in% subset_classes, ]\n",
+ "glom_tse_rarefy <- glom_tse_rarefy[rowData(glom_tse_rarefy)$Class %in% subset_classes, ]\n",
+ "```\n",
+ "\n",
+ "3. Plot our new plots with the `plotAbundance` `mia` method.\n",
+ "\n",
+ "```{r}\n",
+ "relabundance_plot <- plotAbundance(glom_tse_relab, \n",
+ " assay.type = \"relabundance\", \n",
+ " rank = \"Class\", \n",
+ " order_rank_by = \"abund\")\n",
+ "\n",
+ "rarefy_plot <- plotAbundance(glom_tse_rarefy, \n",
+ " assay.type = \"rarefy\", \n",
+ " rank = \"Class\", \n",
+ " order_rank_by = \"abund\")\n",
+ "```\n",
+ "\n",
+ "## Comparative metagenomics at community level: Beta diversity\n",
+ "\n",
+ "Beta (β)-diversity is used in ecology to assess the heterogeneity of local communities. \n",
+ "It is defined as the ratio between regional and local species diversity. In simpler \n",
+ "terms, it calculates the number of species that are not the same in two different \n",
+ "environments.\n",
+ "\n",
+ "Some of the most popular beta diversity measures in microbiome research include \n",
+ "Bray-Curtis index (for compositional data), Jaccard index (for presence/absence \n",
+ "data, ignoring abundance information), Aitchison distance (Euclidean distance \n",
+ "for transformed abundances, aiming to avoid the compositionality bias), and the \n",
+ "Unifrac distances (that use the phylogenetic tree information). \n",
+ "\n",
+ "Only some of the commonly used beta diversity measures are actual distances, \n",
+ "therefore, the term dissimilarity or beta-diversity is commonly used. The workflow \n",
+ "to assess β-diversity between groups of samples starts with calculating the diversity \n",
+ "indices for each pair of samples. This results in a distance (or dissimilarity) \n",
+ "matrix that is often used for ordination (e.g., principal coordinates analysis) \n",
+ "and data exploration in microbiota data analysis ([8](#ref-wagner2018)).\n",
+ "\n",
+ "For statistical analysis, [adonis](https://www.rdocumentation.org/packages/vegan/versions/2.6-4/topics/adonis) \n",
+ "and [betadisper](https://www.rdocumentation.org/packages/vegan/versions/2.6-4/topics/betadisper) \n",
+ "methods are widely used together in β-diversity analyses.\n",
+ "\n",
+ "PERMANOVA tests whether two or more groups have different compositions. PERMANOVA \n",
+ "calculates the squared deviations of each site to the centroid and then, performs \n",
+ "significance tests using F-tests based on sequential sums of squares from \n",
+ "permutations of the raw data. It can be seen as an ANOVA using distance matrices \n",
+ "(analogous to MANOVA). A non significant p-value (p>0.05), means that there’s no \n",
+ "significant difference in composition between groups (we accept the null hypothesis \n",
+ "of no significant difference in composition between groups).\n",
+ "\n",
+ "Betadisper tests if two or more groups are homogeneously dispersed in relation to \n",
+ "their species in studied samples. This test can be done to see if one group has \n",
+ "more compositional variance than another. The method use the distance matrix to \n",
+ "calculate the multivariate dispersions (variances; average distance to centroids). \n",
+ "\n",
+ "Then we use group dispersions to perform an ANOVA test. ANOVA’s p-value not significant \n",
+ "(p>0.05) means that group dispersions are homogenous (null hypothesis of no difference \n",
+ "in dispersion between groups).\n",
+ "\n",
+ "Compute beta diversity using various methods to calculate distance, and perform \n",
+ "principle components analysis ploting the first two axes. According to Pereira \n",
+ "et al., (2018)([9](#ref-pereira2018)), the best normalization method for metagenomic \n",
+ "gene abundance is CSS for large group sizes. For this reason, we will use this \n",
+ "method to show beta-diversity. \n",
+ "\n",
+ "Using as a guide the steps described in ([10](#ref-mcmurdie2018)), we will create \n",
+ "a list of suitable distance methods, iterate through them, and display a combined \n",
+ "plot. For a better visualization we are going to show the 95% confidence region \n",
+ "with an ellipse.\n",
+ "\n",
+ "1. Let's define the list of methods we're going to test.\n",
+ "\n",
+ "```{r}\n",
+ "# Define method list\n",
+ "vegdist_methods <- c(\"manhattan\", \"euclidean\", \"bray\", \"jaccard\",\n",
+ " \"gower\", \"chao\")\n",
+ "\n",
+ "# Create empty vectors to hold method names and plots\n",
+ "created <- c()\n",
+ "plots <- c()\n",
+ "```\n",
+ "\n",
+ "2. Loop through these method types and plot abundance for each, saving each plot \n",
+ "into the `plots` vector.\n",
+ "\n",
+ "```{r}\n",
+ "# Loop through these methods and plot abundance\n",
+ "for (method in vegdist_methods) {\n",
+ " # Set the method name\n",
+ " method_name <- paste(\"MDS\", method, sep = \"_\")\n",
+ " created <- method_name\n",
+ " \n",
+ " # Run MDS \n",
+ " mds_tse <- runMDS(tse,\n",
+ " FUN = vegan::vegdist,\n",
+ " method = method,\n",
+ " assay.type = \"relabundance\",\n",
+ " name = method_name)\n",
+ " \n",
+ " # Create plot object of reducedDim slot\n",
+ " p <- plotReducedDim(mds_tse, dimred = paste(\"MDS\", method, sep = \"_\"))\n",
+ " \n",
+ " # Calculate explained variance\n",
+ " e <- attr(reducedDim(mds_tse, method_name), \"eig\")\n",
+ " rel_eig <- e / sum(e[e > 0])\n",
+ " \n",
+ " # Add explained variance for each axis\n",
+ " p <- p + labs(x = paste(\"MDS_1_\", method, \" (\", round(100 * rel_eig[1], 1), \"%)\", \n",
+ " sep = \"\"),\n",
+ " y = paste(\"MDS_2_\", method, \" (\", round(100 * rel_eig[2], 1), \"%)\", \n",
+ " sep = \"\"),\n",
+ " title = paste(\"MDS using distance method\", method)) +\n",
+ " theme_minimal(base_size = 6) + # Use a minimal theme for clarity\n",
+ " theme(\n",
+ " plot.title = element_text(hjust = 0.5), # Center the title\n",
+ " axis.title = element_text(size = 6), # Smaller axis titles\n",
+ " axis.text = element_text(size = 6), # Smaller axis labels\n",
+ " legend.position = \"bottom\", # Position the legend at the bottom\n",
+ " legend.title = element_text(size = 6), # Smaller legend title\n",
+ " legend.text = element_text(size = 6) # Smaller legend text\n",
+ " )\n",
+ " \n",
+ " # Save plot\n",
+ " plots[[method_name]] <- p\n",
+ "}\n",
+ "```\n",
+ "\n",
+ "3. Combine all plots using patchwork.\n",
+ "\n",
+ "```{r}\n",
+ "# Combine all plots using patchwork\n",
+ "wrapped_plots <- wrap_plots(plots) +\n",
+ " plot_layout(guides = \"collect\") + \n",
+ " plot_annotation(title = \"MDS on Various Distance Metrics\",\n",
+ " theme = theme(plot.title = element_text(size = 14, hjust = 0.5)))\n",
+ "```\n",
+ "\n",
+ "Separation of the groups observed in some of the ordination plots could be due to \n",
+ "a location effect, dispersion effect, or both. To test the location effect (null \n",
+ "hypothesis of no difference in composition between groups) we use a permanova test, \n",
+ "using the bray method to calculate the distance matrix. \n",
+ "\n",
+ "PERMANOVA assumes there is homogeneity of dispersion among groups. We want to also \n",
+ "test this assumption, to check whether the differences detected by our permanova \n",
+ "test are due to variation in dispersion of the data. \n",
+ "\n",
+ "We do this using the betadisper method to calculate variances and then evaluate \n",
+ "if there’s a significant variation between groups through an anova.\n",
+ "\n",
+ "To do all these tests, we use the `addRDA` method, which will do all of them for us.\n",
+ "\n",
+ "```{r}\n",
+ "beta_diversity_tse <- addRDA(\n",
+ " tse,\n",
+ " formula = data ~ sample_geographic.location..country.and.or.sea.region.,\n",
+ " assay.type = \"relabundance\",\n",
+ " method = \"bray\",\n",
+ " test.signif = TRUE,\n",
+ " homogeneity.test = \"anova\",\n",
+ " full = TRUE\n",
+ ")\n",
+ "```\n",
+ "\n",
+ "We now plot and fetch the results\n",
+ "\n",
+ "```{r}\n",
+ "# Plot the results\n",
+ "plotReducedDim(beta_diversity_tse, \"RDA\", \n",
+ " colour_by = \"sample_geographic.location..country.and.or.sea.region.\")\n",
+ "\n",
+ "# Fetch significance results, including PERMANOVA and betadisper (homogeneity) results\n",
+ "significance_results <- attr(reducedDim(beta_diversity_tse, \"RDA\"), \"significance\")\n",
+ "significance_results\n",
+ "```\n",
+ "\n",
+ "ANOVA's p-value is not significant (p>0.05), meaning that there's no significant \n",
+ "difference in dispersion among groups. The PERMANOVA however showed a significant \n",
+ "P value (p<0.05), meaning significant differences in community composition between \n",
+ "the groups. \n",
+ "\n",
+ "## Detection of differentially abundant taxa\n",
+ "There are many approaches out there to detect differentially abundant taxa (or genes).\n",
+ "Here we detect microbiome community composition changes due to various environmental \n",
+ "factors. Describing such associations in quantitative models has multiple applications, \n",
+ "such as in predicting disease status based on microbiome data. \n",
+ "\n",
+ "In addition, the pipeline includes the generation of high-quality plots for visual \n",
+ "inspection of results ([11](#ref-wirbel2021)). There are multiple parameters that \n",
+ "can be adjusted for better performance in each step of the analysis. However, in \n",
+ "this exercise we are using default arguments with minimal tuning.\n",
+ "\n",
+ "### Association testing\n",
+ "\n",
+ "1. First we use our original counts data on our `tse` object and agglomerate \n",
+ "by rank to a new `genus_tse` object.\n",
+ "\n",
+ "```{r}\n",
+ "genus_tse <- agglomerateByRank(tse, rank = \"Genus\")\n",
+ "```\n",
+ "\n",
+ "2. Subset the dataset by prevalence to include only the taxa that are present in \n",
+ "at least 25% of the samples.\n",
+ "\n",
+ "```{r}\n",
+ "genus_tse <- subsetByPrevalent(genus_tse, prevalence = 0.25, include.lowest = TRUE)\n",
+ "```\n",
+ "\n",
+ "3. Subset our `genus_tse` samples into 2 geographic groups, and add an extra `colData` \n",
+ "column to accommodate this.\n",
+ "\n",
+ "```{r}\n",
+ "# Create a new condition vector\n",
+ "conditions <- ifelse(genus_tse$sample_geographic.location..country.and.or.sea.region. \n",
+ " == \"GAZ:United States of America\", \"USA\", \"Other\")\n",
+ "\n",
+ "# Add conditions as a new column in the colData of genus_tse\n",
+ "genus_tse$geo_location <- conditions\n",
+ "```\n",
+ "\n",
+ "4. Now we use the `ALDEx2` package to compute the differential abundance analysis.\n",
+ "\n",
+ "```{r}\n",
+ "# Set seed for reproducibility\n",
+ "set.seed(123)\n",
+ "\n",
+ "# Generate Monte Carlo samples of the Dirichlet distribution for each sample using \n",
+ "# the geographic location for grouping\n",
+ "x <- aldex.clr(assay(genus_tse, \"counts\"), genus_tse$geo_location)\n",
+ "\n",
+ "# Calculates expected values of the Welch's t-test and Wilcoxon rank\n",
+ "x_tt <- aldex.ttest(x, paired.test = FALSE, verbose = FALSE)\n",
+ "\n",
+ "# Determine the median clr abundance of the feature in all samples and in groups, \n",
+ "# the median difference between the two groups, the median variation within each \n",
+ "# group and the effect size - which is the median of the ration of the between \n",
+ "# group difference and the larger of the variance within groups\n",
+ "x_effect <- aldex.effect(x, CI = TRUE, verbose = FALSE)\n",
+ "\n",
+ "# Now combine all outputs into a data frame\n",
+ "aldex_out <- data.frame(x_tt, x_effect)\n",
+ "\n",
+ "# Now we plot a Bland-ALtman plot (left) and a dispersion plot (right)\n",
+ "par(mfrow = c(1, 2))\n",
+ "\n",
+ "# Set up a 1x2 plotting area\n",
+ "par(mfrow = c(1, 2))\n",
+ "\n",
+ "# Bland-Altman plot (left)\n",
+ "aldex.plot(aldex_out,\n",
+ " type = \"MA\",\n",
+ " test = \"welch\",\n",
+ " xlab = \"Log-ratio abundance\",\n",
+ " ylab = \"Difference\",\n",
+ " cutoff = 0.05)\n",
+ "\n",
+ "# Dispersion plot (right)\n",
+ "aldex.plot(aldex_out,\n",
+ " type = \"MW\",\n",
+ " test = \"welch\",\n",
+ " xlab = \"Dispersion\",\n",
+ " ylab = \"Difference\",\n",
+ " cutoff = 0.05)\n",
+ "\n",
+ "# Reset plotting layout\n",
+ "par(mfrow = c(1, 1))\n",
+ "\n",
+ "# Then do some further evaluation on the results\n",
+ "aldex_out %>%\n",
+ " rownames_to_column(var = \"Genus\") %>%\n",
+ " # here we choose the wilcoxon output rather than t-test output\n",
+ " filter(wi.eBH <= 0.05) %>%\n",
+ " dplyr::select(Genus, we.eBH, wi.eBH, effect, overlap) %>%\n",
+ " knitr::kable()\n",
+ "```\n",
+ "\n",
+ "## Confounder Testing \n",
+ "\n",
+ "Test the associated metadata variables for potential confounding influence. As many \n",
+ "biological and technical factors beyond the primary phenotype of interest can influence\n",
+ "microbiome composition, simple association studies may suffer confounding by other \n",
+ "variables, which can lead to spurious results. \n",
+ "\n",
+ "Associations are visualized either as barplot or Q-Q plot, depending on the type \n",
+ "of metadata. Additionally, it evaluates associations among metadata variables \n",
+ "using conditional entropy and associations with the label using generalized linear \n",
+ "models, producing a correlation heatmap and appropriate quantitative barplots, \n",
+ "respectively.\n",
+ "\n",
+ "Here we use the `ANCOMBC` package to test for cofounders.\n",
+ "\n",
+ "1. Compute and store the library size in `colData`.\n",
+ "\n",
+ "```{r}\n",
+ "genus_tse$library_size <- colSums(assay(genus_tse, \"counts\"))\n",
+ "```\n",
+ "\n",
+ "2. The `ancombc2_out` method performs the differential abundance analysis whilst \n",
+ "accounting for multiple covariates and potential confounders. \n",
+ "\n",
+ "```{r}\n",
+ "# Perform the analysis \n",
+ "ancombc2_out <- ancombc2(genus_tse,\n",
+ " assay.type = \"counts\",\n",
+ " fix_formula = \"sample_geographic.location..country.and.or.sea.region. + library_size\",\n",
+ " p_adj_method = \"fdr\",\n",
+ " lib_cut = 0,\n",
+ " group = \"sample_geographic.location..country.and.or.sea.region.\", \n",
+ " struc_zero = TRUE, \n",
+ " neg_lb = TRUE,\n",
+ " alpha = 0.05,\n",
+ " global = TRUE)\n",
+ "```\n",
+ "\n",
+ "## Generation of a predictive model: Machine learning workflow\n",
+ "\n",
+ "We'll now use the `mikropml` library to construct a machine learning meta-analysis\n",
+ "models on the basis of microbial markers. The steps to follow include functions \n",
+ "for data normalization, splitting the data into cross-validation folds, training \n",
+ "the model, and making predictions based on cross-validation instances and the trained \n",
+ "models. \n",
+ "\n",
+ "Such a model can be built on one dataset and then be applied on another, similarly \n",
+ "processed holdout dataset. This might be of interest when comparing data from two \n",
+ "different studies testing the same condition.\n",
+ "\n",
+ "We will work on our original full counts `tse` object.\n",
+ "\n",
+ "1. Normalize the data using CLR transformation.\n",
+ "\n",
+ "```{r}\n",
+ "tse <- transformAssay(tse, method = \"clr\", MARGIN = \"samples\", pseudocount = 1)\n",
+ "assay <- assay(tse, \"clr\")\n",
+ "```\n",
+ "\n",
+ "2. Prepare the cross-validation data frame. Cross-validation is a technique to \n",
+ "assess how well a machine learning model would generalize to external data by \n",
+ "partitioning the dataset into training and test sets. Here, we prepare the data \n",
+ "frame needed for cross-validation.\n",
+ "\n",
+ "```{r}\n",
+ "# Prepare the cross-validation data frame.\n",
+ "# Transpose the assay matrix to have samples as rows and features (taxa) as columns\n",
+ "assay <- t(assay)\n",
+ "\n",
+ "# Convert the transposed matrix to a data frame for easier manipulation and analysis\n",
+ "df <- as.data.frame(assay)\n",
+ "\n",
+ "# Extract the geographic location labels from the colData of the tse object\n",
+ "labels <- colData(tse)$sample_geographic.location..country.and.or.sea.region.\n",
+ "\n",
+ "# Simplify the labels to make them valid R variable names\n",
+ "labels <- gsub(\" \", \"_\", labels)\n",
+ "labels <- gsub(\"[:.]\", \"\", labels)\n",
+ "\n",
+ "# Convert the geographic location labels to a factor (categorical variable)\n",
+ "labels <- as.factor(labels)\n",
+ "\n",
+ "# Add the geographic location labels as a new column in the data frame\n",
+ "df$geo_location <- labels\n",
+ "```\n",
+ "\n",
+ "3. Train the model using `mikropml`. Model training involves selecting and applying \n",
+ "a machine learning algorithm to the data. Here, we use the random forest algorithm \n",
+ "(rf) to train our model. We do 10 folds for our cross-validation, and repeat the \n",
+ "process 5 times.\n",
+ "\n",
+ "```{r}\n",
+ "results <- run_ml(df, \"rf\", outcome_colname = \"geo_location\", kfold = 10, \n",
+ " cv_times = 5, training_frac = 0.8)\n",
+ "```\n",
+ "\n",
+ "4. Model evaluation involves assessing the performance of the trained model. \n",
+ "Here, we use a confusion matrix to evaluate the model's predictions. \n",
+ "\n",
+ "A confusion matrix provides a summary of prediction results on a classification \n",
+ "problem, showing the number of correct and incorrect predictions made by the model \n",
+ "compared to the actual outcomes.\n",
+ "\n",
+ "```{r}\n",
+ "conf_matrix <- confusionMatrix(data = results$trained_model$finalModel$predicted, \n",
+ " reference = results$trained_model$finalModel$y)\n",
+ "```\n",
+ "\n",
+ "5. Evaluate model using `MLEval`. This generates detailed evaluation metrics.\n",
+ "\n",
+ "```{r}\n",
+ "res <- evalm(results$trained_model, showplots = FALSE)\n",
+ "```\n",
+ "\n",
+ "6. Plot ROC and precision-recall curves side-by-side. Visualization is an important \n",
+ "step to understand the performance of the model. Here, we plot the ROC and precision-recall \n",
+ "curves.\n",
+ "\n",
+ "```{r}\n",
+ "roc_pr_plot <- res$roc + res$proc + plot_layout(guides = \"collect\") & theme(legend.position = 'bottom')\n",
+ "```\n",
+ "\n",
+ "# References\n",
+ "\n",
+ "1. McKnight, DT., Huerlimann, R., Bower, DS. et al. \n",
+ "Methods for Normalizing Microbiome Data: An Ecological Perspective. Methods in \n",
+ "Ecology and Evolution / British Ecological Society 10(3):389–400 (2019) \n",
+ "[DOI:10.1111/2041-210X.13115](https://doi.org/10.1111/2041-210X.13115)\n",
+ "\n",
+ "2. McMurdie, PJ., Holmes, S. Waste Not, Want Not: \n",
+ "Why Rarefying Microbiome Data Is Inadmissible. PLoS Comput Biol 10(4):e1003531 (2014) \n",
+ "[DOI:10.1371/journal.pcbi.1003531](https://doi.org/10.1371/journal.pcbi.1003531)\n",
+ "\n",
+ "3. Schloss, Patrick D. ‘Waste Not, Want Not: \n",
+ "Revisiting the Analysis That Called into Question the Practice of Rarefaction’. \n",
+ "mSphere 9, no. 1 (2024): e00355-23 [DOI:10.1128/msphere.00355-23](https://journals.asm.org/doi/10.1128/msphere.00355-23)\n",
+ "\n",
+ "4. Schloss PD. 2024. Rarefaction is currently the \n",
+ "best approach to control for uneven sequencing effort in amplicon sequence analyses. \n",
+ "mSphere 9 [DOI:10.1128/msphere.00354-23](https://journals.asm.org/doi/10.1128/msphere.00354-23)\n",
+ "\n",
+ "5. Lynch, M., Neufeld, J. Ecology and exploration of \n",
+ "the rare biosphere. Nat Rev Microbiol 13:217–229 (2015) [DOI:10.1038/nrmicro3400](https://doi.org/10.1038/nrmicro3400)\n",
+ "\n",
+ "6. Willis, A. Rarefaction, Alpha Diversity, and Statistics. \n",
+ "Front. Microbiol. Sec. Terrestrial Microbiology 10:2407 (2019) [DOI:10.3389/fmicb.2019.02407](https://doi.org/10.3389/fmicb.2019.02407)\n",
+ "\n",
+ "7. Kim, BR., Shin, J., Guevarra, R. et al. Deciphering \n",
+ "Diversity Indices for a Better Understanding of Microbial Communities. J Microbiol \n",
+ "Biotechnol 28;27(12):2089-2093 (2017) [DOI:10.4014/jmb.1709.09027](https://doi.org/10.4014/jmb.1709.09027)\n",
+ "\n",
+ "8. Wagner, BD., Grunwald, GK., Zerbe, GO. et al. On \n",
+ "the Use of Diversity Measures in Longitudinal Sequencing Studies of Microbial \n",
+ "Communities. Front. Microbiol. 9:1037 (2018) [DOI:10.3389/fmicb.2018.01037](https://doi.org/10.3389/fmicb.2018.01037)\n",
+ "\n",
+ "9. Pereira, M., Wallroth, M., Jonsson, V. et al. \n",
+ "Comparison of normalization methods for the analysis of metagenomic gene abundance \n",
+ "data. BMC Genomics 19,274 (2018) [DOI:10.1186/s12864-018-4637-6](https://doi.org/10.1186/s12864-018-4637-6)\n",
+ "\n",
+ "10. McMurdie, P., Holmes, S. Tutorial of 2018: The \n",
+ "distance function in phyloseq. Available at website\n",
+ "\n",
+ "11. Wirbel, J., Zych, K., Essex, M. et al. Microbiome \n",
+ "meta-analysis and cross-disease comparison enabled by the SIAMCAT machine learning \n",
+ "toolbox. Genome Biol 22:93 (2021) [DOI:10.1186/s13059-021-02306-1](https://doi.org/10.1186/s13059-021-02306-1)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/src/notebooks/R Mia Examples/Fetch-Analyses-metadata-for-a-Study.ipynb b/src/notebooks/R Mia Examples/Fetch-Analyses-metadata-for-a-Study.ipynb
new file mode 100644
index 0000000..338271b
--- /dev/null
+++ b/src/notebooks/R Mia Examples/Fetch-Analyses-metadata-for-a-Study.ipynb
@@ -0,0 +1,143 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "---\n",
+ "title: \"Fetch analyses metadata for a study\"\n",
+ "author:\n",
+ " - name: Noah de Gunst\n",
+ " affiliation:\n",
+ " - id: mia\n",
+ " name: Department of Computing, University of Turku, Finland\n",
+ " - name: Sam Hillman\n",
+ " affiliation:\n",
+ " - id: mia\n",
+ " name: Department of Computing, University of Turku, Finland\n",
+ "categories: [R]\n",
+ "execute: \n",
+ " eval: true\n",
+ "---\n",
+ "\n",
+ "::: {style=\"max-width:1200px\"}\n",
+ "![](./../_resources/mgnify_logo.png)\n",
+ "\n",
+ "# Fetch a study using MGnifyR; download the metadata for all of its analyses\n",
+ "\n",
+ "The [MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1) returns data and \n",
+ "relationships as JSON. [MGnifyR](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html) \n",
+ "is a package to help you read MGnify data into your R analyses.\n",
+ "\n",
+ "You can find all of the other \"API endpoints\" using the [Browsable API interface in your web browser](https://www.ebi.ac.uk/metagenomics/api/v1).\n",
+ "\n",
+ "This is an interactive code notebook (a Jupyter Notebook). To run this code, click \n",
+ "into each cell and press the ▶ button in the top toolbar, or press `shift+enter`.\n",
+ "\n",
+ "------------------------------------------------------------------------\n",
+ ":::\n",
+ "\n",
+ "#### Setting the access code\n",
+ "First, we need to specify the accession number of the study we're working with. \n",
+ "This can be done by setting the `mgnify_study_accession` variable. The accession \n",
+ "number uniquely identifies the study in the MGnify database.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "\n",
+ "source(\"./utils/variable_utils.r\")\n",
+ "\n",
+ "mgnify_study_accession <- get_variable_from_link_or_input('MGYS', 'Study Accession', \n",
+ " 'MGYS00005116')\n",
+ "\n",
+ "# You can also just directly set the accession variable in code, like this:\n",
+ "# mgnify_study_accession <- \"MGYS00005292\"\n",
+ "```\n",
+ "\n",
+ "#### Constructing a MgnifyClient object to access the database\n",
+ "To interact with the MGnify database, we need to create an MgnifyClient object. \n",
+ "This object allows us to fetch data from MGnify, and we can configure it to use \n",
+ "a cache for efficiency. \n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "# Importing the libraries\n",
+ "library(vegan)\n",
+ "library(ggplot2)\n",
+ "library(mia)\n",
+ "library(MGnifyR)\n",
+ "\n",
+ "# Check if the cache directory exists, if not, create it\n",
+ "if (!dir.exists(\"./.mgnify_cache\")) {\n",
+ " dir.create(\"./.mgnify_cache\", recursive = TRUE)\n",
+ "}\n",
+ "\n",
+ "# Create the MgnifyClient object with caching enabled\n",
+ "mg <- MgnifyClient(usecache = TRUE, cache_dir = \"./.mgnify_cache\")\n",
+ "```\n",
+ "\n",
+ "#### Displaying the help file\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "library(IRdisplay)\n",
+ "display_markdown(file = '../_resources/mgnifyr_help.md')\n",
+ "```\n",
+ "\n",
+ "## Fetch a list of the Analyses for the Study\n",
+ "Using the MgnifyClient object, we can search for all analyses associated with the \n",
+ "study accession number we set earlier. This will return a list of analysis accession \n",
+ "numbers.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "analyses_accessions <- searchAnalysis(mg, \"studies\", mgnify_study_accession)\n",
+ "analyses_accessions\n",
+ "```\n",
+ "\n",
+ "## Download metadata for the first 10 Analyses\n",
+ "...and put it into a dataframe.\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "analyses_metadata_df <- getMetadata(mg, head(analyses_accessions, 10))\n",
+ "```\n",
+ "\n",
+ "## Display metadata\n",
+ "The table could be big, so let's look at a sample of it (`head`).\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "t(head(analyses_metadata_df))\n",
+ "```\n",
+ "\n",
+ "## Download the data to a multi-assay data object\n",
+ "\n",
+ "> [mia](https://microbiome.github.io/mia/) is a Bioconductor package designed to \n",
+ "import, store and analyze microbiome data using an object called a `TreeSummarizedExperiment.` \n",
+ "This is a tailored data container optimized for microbiome data analysis.Being \n",
+ "built on the `SummarizedExperiment` class, miaverse seamlessly integrates into the \n",
+ "extensive `SummarizedExperiment` ecosystem. In this example we download the MGnifyR \n",
+ "data to an MAE, which contains multiple `TreeSummarizedExperiment` objects.\n",
+ "\n",
+ "\n",
+ "```{r}\n",
+ "#| output: false\n",
+ "mae <- getResult(mg, accession = analyses_accessions)\n",
+ "```\n",
+ "\n",
+ "You use `MGnifyR` features further, for example to download data. Check the Cheat \n",
+ "Sheet at the top for more."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file
diff --git a/src/notebooks/R Mia Examples/_resources/mgnify_banner.png b/src/notebooks/R Mia Examples/_resources/mgnify_banner.png
new file mode 100644
index 0000000..0c504d2
Binary files /dev/null and b/src/notebooks/R Mia Examples/_resources/mgnify_banner.png differ
diff --git a/src/notebooks/R Mia Examples/_resources/mgnify_logo.png b/src/notebooks/R Mia Examples/_resources/mgnify_logo.png
new file mode 100644
index 0000000..d4f4fd5
Binary files /dev/null and b/src/notebooks/R Mia Examples/_resources/mgnify_logo.png differ
diff --git a/src/notebooks/R Mia Examples/_resources/mgnifyr_help.md b/src/notebooks/R Mia Examples/_resources/mgnifyr_help.md
new file mode 100644
index 0000000..668fa08
--- /dev/null
+++ b/src/notebooks/R Mia Examples/_resources/mgnifyr_help.md
@@ -0,0 +1,12 @@
+# Help with MGnifyR
+
+MGnifyR is an R package that provides a convenient way for R users to access data from [the MGnify API](https://www.ebi.ac.uk/metagenomics/api/).
+
+Detailed help for each function is available in R using the standard `?function_name` command.
+
+A vignette is available containing a reasonably verbose overview of the main functionality.
+This can be read either within R with the `vignette("MGnifyR")` command, or [on the bioconductor vignette website](https://www.bioconductor.org/packages/release/bioc/vignettes/MGnifyR/inst/doc/MGnifyR.html)
+
+## MGnifyR Command cheat sheet
+
+For a full list of key MGnifyR functions, please look at the [MGnifyR website](https://ebi-metagenomics.github.io/MGnifyR/reference/index.html).
diff --git a/src/notebooks/R Mia Examples/utils/kegg_pathways_utils.r b/src/notebooks/R Mia Examples/utils/kegg_pathways_utils.r
new file mode 100644
index 0000000..c2f803e
--- /dev/null
+++ b/src/notebooks/R Mia Examples/utils/kegg_pathways_utils.r
@@ -0,0 +1,60 @@
+# Retrieve name and URL for a specific pathway in the KEGG database
+get_pathway_info <- function(pathway) {
+ pathway <- paste("map", pathway, sep = "")
+ pathway_name <- keggList(pathway)[[1]]
+ pathway_url <- paste("https://www.kegg.jp/pathway/", pathway, sep = "")
+ return(list(pathway_name = pathway_name, pathway_url = pathway_url))
+}
+
+
+# Function to prompt users for pathway selection and return custom pathway IDs
+PathwaysSelection <- function() {
+ display_markdown("#### Pathways Selection :\n\n
+- For the most general & most complete pathways, input 'G'\n\n
+- Press Enter to generate the most complete pathways\n\n
+- To add custom pathways, input pathway numbers (ex: 00053,01220)")
+
+ flush.console()
+ CUSTOM_PATHWAY_IDS <- get_variable_from_link_or_input('CUSTOM_PATHWAY_IDS', name = 'Pathways Accession', default = '')
+
+ if (CUSTOM_PATHWAY_IDS == "") {
+ CUSTOM_PATHWAY_IDS <- list()
+ } else if (CUSTOM_PATHWAY_IDS == "G") {
+ CUSTOM_PATHWAY_IDS <- list("00010", "00020", "00030", "00061", "01232","00240", "00190")
+ } else {
+ CUSTOM_PATHWAY_IDS <- strsplit(CUSTOM_PATHWAY_IDS, ",")[[1]]
+ }
+
+ message(if (length(CUSTOM_PATHWAY_IDS) > 0) {
+ paste("\nUsing", CUSTOM_PATHWAY_IDS, " - ", sapply(CUSTOM_PATHWAY_IDS, function(id) paste(get_pathway_info(id)[1]," : ",get_pathway_info(id)[2])), "as a Custom Pathway")
+ } else {
+ "\nUsing NONE as a Custom Pathway"
+ })
+ return(CUSTOM_PATHWAY_IDS)
+}
+
+
+# Clearing the current working directory and displaying generated figures from `pathway_plots/` directory
+generatePathwayPlots <- function() {
+# Clearing the current working directory
+ if (!dir.exists("pathway_plots")) {
+ dir.create("pathway_plots")
+ }
+
+ file.copy(from = list.files(pattern = "./*pathview.png"), to = "./pathway_plots/", overwrite = TRUE)
+
+ png_files <- list.files(path = ".", pattern = "*.png")
+ xml_files <- list.files(path = ".", pattern = "*.xml")
+ files <- c(png_files, xml_files)
+ output <- capture.output({
+ unlink(files)
+ })
+
+# Accessing the png files and displaying it
+ images <- list.files("pathway_plots", full.names = TRUE)
+
+ for (pathway in images) {
+ display_markdown(get_pathway_info(gsub("[^0-9]", "", basename(pathway)))$pathway_name)
+ display_png(file = pathway)
+ }
+}
\ No newline at end of file
diff --git a/src/notebooks/R Mia Examples/utils/variable_utils.r b/src/notebooks/R Mia Examples/utils/variable_utils.r
new file mode 100644
index 0000000..f6c2cc9
--- /dev/null
+++ b/src/notebooks/R Mia Examples/utils/variable_utils.r
@@ -0,0 +1,16 @@
+library(glue)
+
+get_variable_from_link_or_input <- function(variable, name = 'accession', default = NA) {
+ # Get a variable value, either from an ENV VAR that would have been set by the jlab_query_params extension, or through direct user input.
+ var <- Sys.getenv(variable, unset = NA)
+ if (!is.na(var)) {
+ print(glue('Using {name} = {var} from the link you followed.'))
+ } else {
+ determiner <- ifelse(grepl(tolower(substr(name, 0, 1)), 'aeiou'), 'an', 'a')
+ var <- readline(prompt = glue("Type {determiner} {name} [default: {default}]"))
+ }
+ var <- ifelse(is.na(var) || var == '', default, var)
+ print(glue('Using "{var}" as {name}'))
+ var
+}
+