Merge branch 'devel' into remove-nix
TuomasBorman authored Nov 14, 2024
2 parents c3afaec + 3733264 commit ffd2bb9
Showing 6 changed files with 116 additions and 74 deletions.
32 changes: 15 additions & 17 deletions DESCRIPTION
@@ -1,19 +1,14 @@
Package: OMA
Title: Orchestrating Microbiome Analysis with Bioconductor
Version: 0.98.28
Version: 0.98.30
Date: 2024-10-04
Authors@R:
c(person("Leo", "Lahti", role = c("aut"),
comment = c(ORCID = "0000-0001-5537-637X")),
person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
comment = c(ORCID = "0000-0002-8563-8884")),
person("Felix GM", "Ernst", email = "[email protected]",
role = c("aut"),
comment = c(ORCID = "0000-0001-5064-0928")),
person("and others", "(see the full list of contributors)",
role = c("ctb"))
)
c(
person(given = "Tuomas", family = "Borman", role = c("aut", "cre"), email = "[email protected]", comment = c(ORCID = "0000-0002-8563-8884")),
person("Leo", "Lahti", role = c("aut"), comment = c(ORCID = "0000-0001-5537-637X")),
person("Felix GM", "Ernst", email = "[email protected]", role = c("aut"), comment = c(ORCID = "0000-0001-5064-0928")),
person("and others", "(see the full list of contributors)", role = c("ctb"))
)
Description:
This is a reference cookbook for **Microbiome Data Science** with
R and Bioconductor.
@@ -43,6 +38,7 @@ Suggests:
cobiclust,
ComplexHeatmap,
corpcor,
cowplot,
curatedMetagenomicData,
dada2,
dendextend,
@@ -51,16 +47,16 @@ Suggests:
dplyr,
DT,
factoextra,
forcats,
fido,
forcats,
ggplot2,
ggpubr,
ggtree,
glmnet,
glue,
grid,
gtools,
gsEasy,
gtools,
igraph,
IntegratedLearner,
knitr,
@@ -75,29 +71,31 @@ Suggests:
MMUPHin,
MOFA2,
multiview,
NetCoMi,
NbClust,
NetCoMi,
NMF,
patchwork,
phyloseq,
plotly,
plotROC,
purrr,
qgraph,
RColorBrewer,
rebook,
reshape2,
reticulate,
rgl,
ROCR,
scales,
scater,
scuttle,
sechm,
sessioninfo,
shadowtext,
SpiecEasi,
SPRING,
stats,
stringr,
SuperLearner,
tidyverse,
topGO,
vegan,
@@ -112,7 +110,7 @@ Remotes:
github::GraceYoon/SPRING,
github::himelmallick/IntegratedLearner
VignetteBuilder: knitr
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
BiocType: Book
BiocBookTemplate: 1.0.5
SystemRequirements: quarto
2 changes: 1 addition & 1 deletion PackageInstallations_Troubleshoots.qmd
@@ -16,7 +16,7 @@ First of all, please ensure that you have an up-to-date version of R
## Mac M1 user

When attempting to install miaverse packages, you may encounter
installation failures related to the 'scuttle' and 'scatter'
installation failures related to the 'scuttle' and 'scater'
dependencies, which require a gcc compiler for installation. The error
message might resemble the following:

28 changes: 9 additions & 19 deletions inst/pages/integrated_learner.qmd
@@ -196,7 +196,7 @@ provides us with the overall importance of each feature in the final model.
```{r}
#| label: feat_importance
library(ggplot2)
library(miaViz)
# Get individual models
models <- fit$model_fits$model_layers
@@ -208,28 +208,18 @@ importances <- lapply(seq_len(length(models)), function(i){
temp <- temp * fit$weights[[i]]
return(temp)
})
# Combine and order to most important features
# Combine the feature importances
importances <- do.call(rbind, importances)
importances <- importances[
order(importances, decreasing = TRUE), , drop = FALSE]
# Add features to column
importances <- importances |> as.data.frame()
importances[["Feature"]] <- factor(
rownames(importances), levels = rownames(importances))
# Convert to 0-1 scale
importances[[1]] <- importances[[1]] / sum(importances[[1]])
# Get top 20 importances
top_n <- 20
importances <- importances[ seq_len(top_n), ]
# Plot as a bar plot
p <- ggplot(importances, aes(x = MeanDecreaseGini, y = Feature)) +
geom_bar(stat = "identity")
# Plot feature importances
p <- plotLoadings(importances, ncomponents = 1, n = 20, show.color = FALSE)
p
```

From the plot, we can observe that _`r importances[1, "Feature"]`_ and
_`r importances[1, "Feature"]`_ appear to have the greatest predictive power
From the plot, we can observe that
_`r rownames(importances)[order(importances, decreasing = TRUE)][[1]]`_ and
_`r rownames(importances)[order(importances, decreasing = TRUE)][[2]]`_ appear
to have the greatest predictive power
among all the features in determining the outcome. However, the predictive
power appears to be fairly evenly distributed across all features.

47 changes: 22 additions & 25 deletions inst/pages/machine_learning.qmd
@@ -196,7 +196,8 @@ model <- train(
tuneGrid = tune_grid,
trControl = train_control,
weights = class_weights,
max_delta_step = 1
max_delta_step = 1,
verbosity = 0
)
# Get predictions
@@ -211,41 +212,37 @@ technique for binary classification problems.
```{r}
#| label: ROC
library(ROCR)
library(plotROC)
# Get positive class
pos_class <-levels(res[["obs"]])[[1]]
# Create ROC plot
pred <- prediction(res[[pos_class]], ifelse(res[["obs"]] == pos_class, 1, 0))
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
p <- plot(perf)
# Prepare data for ROC
roc_data <- data.frame(
observed_class = as.numeric(res[["obs"]] == "healthy"),
predicted_probability = res[["healthy"]]
)
# Plot ROC curve
p <- ggplot(roc_data, aes(m = predicted_probability, d = observed_class)) +
geom_roc() +
style_roc(theme = theme_minimal())
p
```

XGBoost model returns also feature importances that can be visualized with bar
XGBoost model also returns feature importance scores that can be visualized with a bar
plot.

```{r}
#| label: xgboost_feat
library(xgboost)
library(miaViz)
# Get feature importance and convert to matrix
df <- xgb.importance(model = model$finalModel) |> as.data.frame()
rownames(df) <- df[["Feature"]]
df <- as.matrix(df[, "Gain", drop = FALSE])
# Get feature importance
df <- xgb.importance(model = model$finalModel)
# Take top 20 features
df <- df[seq_len(20), ]
# Factorize to preserve order
df[["Feature"]] <- factor(df[["Feature"]], levels = df[["Feature"]])
# Round values, add percentage symbol
df[["Percentage"]] <- paste0(round(df[["Gain"]], 3)*100, "%")
# Create a plot
p <- ggplot(df, aes(x = Feature, y = Gain)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Percentage), hjust = -0.1, size = 2.5) +
expand_limits(y = max(df[["Gain"]]) + 0.01) +
scale_y_continuous(labels = scales::percent) +
coord_flip()
# Create plot for top 20 features
p <- plotLoadings(df, ncomponents = 1, n = 20, show.color = FALSE)
p
```

40 changes: 39 additions & 1 deletion inst/pages/taxonomy.qmd
@@ -130,9 +130,47 @@ table. For instance, we can check all the taxa that matches with "Escherichia".

```{r}
#| label: mapTaxonomy
mapTaxonomy(GlobalPatterns, taxa = "Escherichia")
mapTaxonomy(tse, taxa = "Escherichia")
```

## Prune taxonomy tree {#sec-update-tree}

Subsetting is explained in detail in [@sec-treese_subsetting]. However, if
you've already subsetted your data, you may have noticed that the taxonomy tree
does not automatically update when using the `[]` operators. Although the
linkages between rows and tree nodes remain correct, the tree retains its
original, complex structure. You may be wondering how to update the tree to
reflect the newly simplified data.

`mia` package functions `subsetBy*` and `agglomerateBy*` (see
[@sec-agglomeration]) include an `update.tree` parameter to handle this
adjustment. However, when using `[]`, tree pruning must be done as an additional
step.
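
As a minimal sketch of that built-in handling (assuming the same `tse` object used in this chapter; `agglomerateByRank()` and its `update.tree` argument belong to the `mia` package, as noted above):

```r
library(mia)

# Agglomerate the data to Phylum level. With update.tree = TRUE the
# rowTree is pruned in the same step, so its tips match the new rows.
tse_phylum <- agglomerateByRank(tse, rank = "Phylum", update.tree = TRUE)

# The pruned tree now has one tip per agglomerated row
tse_phylum
```

With `[]`, by contrast, no such pruning happens, which is why the extra step below is needed.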

Let's start by selecting 5 arbitrary rows.

```{r}
#| label: subset_tse
tse_sub <- tse[1:5, ]
tse_sub
```

Even though we have only 5 rows, the tree still retains its original number of
tips. To align the tree with the subsetted data, we can use the
`TreeSummarizedExperiment::subsetByLeaf()` function, which allows us to select
specific tips from the tree, effectively pruning it to match the current subset
of data.

```{r}
#| label: subset_tree
tse_sub <- subsetByLeaf(tse_sub, rowLeaf = rownames(tse_sub))
tse_sub
```

Now, we can see that the taxonomy tree has a simpler structure, including only
the selected leaves.

## Generate a hierarchy tree on the fly {#sec-fly-tree}

41 changes: 30 additions & 11 deletions inst/pages/training.qmd
@@ -12,8 +12,10 @@ The page provides practical information to support training and self-study.
Brief checklist to prepare for training (see below for links).

- Install the recommended software

- If the time allows, watch the short online videos and familiarize yourself
with the other available material

- Join Gitter online chat for support

## Recommended software {#sec-software}
@@ -32,21 +34,22 @@ information. RStudio is optional.
- Install key R packages (Section [@sec-ecosystem] provides an
installation script)

- After a successful installation you can consider trying out examples
from Section [@sec-exercises] already before training. **You can run
the workflows by simply copy-pasting examples.** You can then test
further examples from this tutorial, modifying and applying these
techniques to your own data. Plain source code for the individual chapters
of this book are available via
[Github](https://github.com/microbiome/OMA/tree/master/R)

- If you have access to CSC notebook you can find instructions from
[here](https://microbiome.github.io/outreach/).
Once you've successfully installed the software, consider exploring examples
from Section [@sec-exercises] even before starting the training. Running the
workflows is easy — just copy and paste the examples. You can then try
additional examples from the book, adapting and applying the techniques to your
own data. Source code for each chapter is available on
[Github](https://github.com/microbiome/OMA).
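
As a rough sketch of such an installation (the authoritative script is in Section [@sec-ecosystem]; the package names here are illustrative examples drawn from this book):

```r
# Install BiocManager if it is missing, then core miaverse packages
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install(c("mia", "miaViz"))
```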

## Study material {#sec-material}

We encourage you to familiarize yourself with the material and test examples
in advance but this is optional:
in advance but this is optional. If you're new to this topic, the following
resources may be particularly helpful.

- [Interactive R tutorials for beginners](https://rstudio.github.io/learnr/articles/examples.html)

- [Introduction to R](https://noppe.2.rahtiapp.fi/main/catalog) (available in CSC Noppe, see [@sec-vm])

- [Introduction to data analysis with R and Bioconductor](https://carpentries-incubator.github.io/bioc-intro/) (for beginners with R)

@@ -61,3 +64,19 @@ in advance but this is optional:
- @sec-exercises for self-study

- @sec-resources and links to complementary external material

## Virtual machines and learning environments {#sec-vm}

In most of the training courses, we use learning environments that have
necessary software installed. Check from course details, if this applied to the
course that you are participating.

In most training courses, we use learning environments with the required software pre-installed. Please check the course details to see if this applies to your course.

- CSC Noppe (formerly Notebooks) is available for users with accounts at Finnish
higher education institutions or state research institutes. For more
information, visit [this page](https://microbiome.github.io/outreach/).

- [Bioconductor workshops](https://workshop.bioconductor.org/) are accessible
with a Bioconductor account, providing pre-installed Bioconductor software and
workshops.
