diff --git a/DESCRIPTION b/DESCRIPTION index 1fedb91d6..cc6be3f8a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: mia Type: Package -Version: 1.15.19 +Version: 1.15.20 Authors@R: c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"), email = "tuomas.v.borman@utu.fi", @@ -75,7 +75,6 @@ Imports: IRanges, MASS, MatrixGenerics, - mediation, methods, rbiom, rlang, @@ -83,6 +82,7 @@ Imports: scater, scuttle, stats, + stringr, tibble, tidyr, utils, @@ -93,6 +93,7 @@ Suggests: biomformat, dada2, knitr, + mediation, miaViz, microbiomeDataSets, NMF, @@ -102,7 +103,6 @@ Suggests: reldist, rhdf5, rmarkdown, - stringr, testthat, topicdoc, topicmodels, diff --git a/NAMESPACE b/NAMESPACE index 739a0b590..8b940728d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -400,10 +400,10 @@ importFrom(dplyr,group_by) importFrom(dplyr,mutate) importFrom(dplyr,n) importFrom(dplyr,rename) +importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,sym) importFrom(dplyr,tally) -importFrom(mediation,mediate) importFrom(rbiom,unifrac) importFrom(rlang,":=") importFrom(rlang,sym) @@ -429,11 +429,18 @@ importFrom(stats,runif) importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,terms) +importFrom(stringr,str_extract) +importFrom(stringr,str_extract_all) +importFrom(stringr,str_pad) +importFrom(stringr,str_replace_all) importFrom(tibble,rownames_to_column) importFrom(tibble,tibble) +importFrom(tidyr,ends_with) importFrom(tidyr,pivot_longer) importFrom(tidyr,pivot_wider) +importFrom(tidyr,starts_with) importFrom(tidyr,unnest) +importFrom(tidyr,unnest_wider) importFrom(utils,assignInMyNamespace) importFrom(utils,combn) importFrom(utils,getFromNamespace) diff --git a/R/convertFromBIOM.R b/R/convertFromBIOM.R index 9e68d6eee..fe0c61a5c 100644 --- a/R/convertFromBIOM.R +++ b/R/convertFromBIOM.R @@ -1,86 +1,86 @@ #' Convert a \code{TreeSummarizedExperiment} object to/from \sQuote{BIOM} #' results #' -#' For convenience, a few functions are available to convert BIOM, DADA2 and -#' phyloseq objects to +#' For convenience, a few functions are available to convert BIOM, DADA2 and +#' phyloseq objects to #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' objects, and +#' objects, and #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} #' objects to phyloseq objects. -#' +#' #' @param prefix.rm \code{Logical scalar}. Should #' taxonomic prefixes be removed? The prefixes is removed only from detected #' taxa columns meaning that \code{rank.from.prefix} should be enabled in the #' most cases. (Default: \code{FALSE}) -#' +#' #' @param removeTaxaPrefixes Deprecated. Use \code{prefix.rm} instead. -#' +#' #' @param rank.from.prefix \code{Logical scalar}. If file does not have #' taxonomic ranks on feature table, should they be scraped from prefixes? #' (Default: \code{FALSE}) -#' +#' #' @param rankFromPrefix Deprecated.Use \code{rank.from.prefix} instead. -#' +#' #' @param artifact.rm \code{Logical scalar}. If file have #' some taxonomic character naming artifacts, should they be removed. #' (default (Default: \code{FALSE}) -#' +#' #' @param remove.artifacts Deprecated. Use \code{artifact.rm} instead. -#' -#' @details +#' +#' @details #' \code{convertFromBIOM} coerces a \code{\link[biomformat:biom-class]{biom}} -#' object to a +#' object to a #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} #' object. -#' +#' #' \code{convertToBIOM} coerces a #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} #' object to a \code{\link[biomformat:biom-class]{biom}} object. -#' +#' #' @return #' \code{convertFromBIOM} returns an object of class #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' +#' #' @name importBIOM -#' +#' #' @seealso #' \code{\link[=importQIIME2]{importQIIME2}} #' \code{\link[=importMothur]{importMothur}} #' #' @examples -#' +#' #' # Convert BIOM results to a TreeSE #' # Load biom file #' library(biomformat) #' biom_file <- system.file("extdata", "rich_dense_otu_table.biom", #' package = "biomformat") -#' +#' #' # Make TreeSE from BIOM object #' biom_object <- biomformat::read_biom(biom_file) #' tse <- convertFromBIOM(biom_object) -#' +#' #' # Convert TreeSE object to BIOM #' biom <- convertToBIOM(tse) -#' +#' NULL #' Import BIOM results to \code{TreeSummarizedExperiment} -#' +#' #' @param file BIOM file location -#' +#' #' @param ... additional arguments to be passed to \code{convertFromBIOM} -#' +#' #' @details -#' \code{importBIOM} loads a BIOM file and creates a -#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} +#' \code{importBIOM} loads a BIOM file and creates a +#' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} #' object from the BIOM object contained in the loaded file. -#' -#' @return +#' +#' @return #' \code{importBIOM} returns an object of class #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' +#' #' @name importBIOM -#' +#' #' @seealso #' \code{\link[=importMetaPhlAn]{importMetaPhlAn}} #' \code{\link[=convertFromPhyloseq]{convertFromPhyloseq}} @@ -95,21 +95,21 @@ NULL #' library(biomformat) #' biom_file <- system.file( #' "extdata", "rich_dense_otu_table.biom", package = "biomformat") -#' +#' #' # Make TreeSE from biom file #' tse <- importBIOM(biom_file) -#' +#' #' # Get taxonomyRanks from prefixes and remove prefixes #' tse <- importBIOM( #' biom_file, rank.from.prefix = TRUE, prefix.rm = TRUE) -#' +#' #' # Load another biom file #' biom_file <- system.file( #' "extdata", "Aggregated_humanization2.biom", package = "mia") -#' +#' #' # Clean artifacts from taxonomic data #' tse <- importBIOM(biom_file, artifact.rm = TRUE) -#' +#' #' @export importBIOM <- function(file, ...) { .require_package("biomformat") @@ -118,15 +118,15 @@ importBIOM <- function(file, ...) { } #' @rdname importBIOM -#' +#' #' @param x object of type \code{\link[biomformat:biom-class]{biom}} #' #' @export #' @importFrom S4Vectors make_zero_col_DFrame DataFrame #' @importFrom dplyr %>% bind_rows convertFromBIOM <- function( - x, prefix.rm = removeTaxaPrefixes, - removeTaxaPrefixes = FALSE, rank.from.prefix = rankFromPrefix, + x, prefix.rm = removeTaxaPrefixes, + removeTaxaPrefixes = FALSE, rank.from.prefix = rankFromPrefix, rankFromPrefix = FALSE, artifact.rm = remove.artifacts, remove.artifacts = FALSE, ...){ # input check @@ -147,7 +147,7 @@ convertFromBIOM <- function( counts <- as(biomformat::biom_data(x), "matrix") sample_data <- biomformat::sample_metadata(x) feature_data <- biomformat::observation_metadata(x) - + # colData is initialized with empty tables with rownames if it is NULL if( is.null(sample_data) ){ sample_data <- S4Vectors::make_zero_col_DFrame(ncol(counts)) @@ -164,15 +164,15 @@ convertFromBIOM <- function( rownames(feature_data) <- rownames(counts) # Otherwise convert it into correct format if it is a list } else if( is(feature_data, "list") ){ - # Feature data is a list of taxa info. Dfs are merged together - # differently than sample metadata since the column names are only - # "Taxonomy". If there is only one taxonomy level, the column name does + # Feature data is a list of taxa info. Dfs are merged together + # differently than sample metadata since the column names are only + # "Taxonomy". If there is only one taxonomy level, the column name does # not get a suffix. # --> bind rows based on the index of column. - + # Get the maximum length of list max_length <- max( lengths(feature_data) ) - # Get the column names from the taxa info that has all the levels that + # Get the column names from the taxa info that has all the levels that # occurs in the data colnames <- names( head( feature_data[ lengths(feature_data) == max_length ], 1)[[1]]) @@ -181,7 +181,7 @@ convertFromBIOM <- function( # have all the levels. E.g., if only Kingdom level is found, all lower # ranks are now NA feature_data <- lapply(feature_data, function(x){ - length(x) <- max_length + length(x) <- max_length return(x) }) # Create a data.frame from the list @@ -203,7 +203,7 @@ convertFromBIOM <- function( feature_data <- cbind(tax_tab, feature_data) feature_data <- as.data.frame(feature_data) } - + # Clean feature_data from possible character artifacts if specified. if( artifact.rm ){ feature_data <- .detect_taxa_artifacts_and_clean(feature_data, ...) @@ -224,7 +224,7 @@ convertFromBIOM <- function( # Adjust row and colnames rownames(counts) <- rownames(feature_data) <- biomformat::rownames(x) colnames(counts) <- rownames(sample_data) <- biomformat::colnames(x) - + # Convert into DataFrame sample_data <- DataFrame(sample_data) feature_data <- DataFrame(feature_data) @@ -245,15 +245,15 @@ convertFromBIOM <- function( } #' @rdname importBIOM -#' +#' #' @param x #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} -#' +#' #' @param assay.type \code{Character scaler}. The name of assay. #' (Default: \code{"counts"}) -#' +#' #' @param ... Additional arguments. Not used currently. -#' +#' #' @export setMethod( "convertToBIOM", signature = c(x = "SummarizedExperiment"), @@ -287,8 +287,8 @@ setMethod( stop("'ignore.col' must be a character value or NULL.", call. = FALSE) } # - # Subset by taking only taxonomy info if user want to remove the pattern - # only from those. (Might be too restricting, e.g., if taxonomy columns are + # Subset by taking only taxonomy info if user want to remove the pattern + # only from those. (Might be too restricting, e.g., if taxonomy columns are # not detected in previous steps. That is way the default is FALSE) ind <- rep(TRUE, ncol(feature_tab)) if( only.taxa.col ){ @@ -334,10 +334,11 @@ setMethod( if( sum(found_rank) == 1 ){ colname <- names(prefixes)[found_rank] } - return(colname) + return(colname) } # Detect and clean non wanted characters from Taxonomy data if needed. +#' @importFrom stringr str_extract_all .detect_taxa_artifacts_and_clean <- function( x, pattern = "auto", ignore.col = "taxonomy_unparsed", ...) { # @@ -360,13 +361,12 @@ setMethod( if( ncol(x) > 0 ){ # Remove artifacts if( pattern == "auto" ){ - .require_package("stringr") # Remove all but these characters pattern <- "[[:alnum:]]|-|_|\\[|\\]|,|;\\||[[:space:]]" x <- lapply(x, function(col){ # Take all specified characters as a matrix where each column # is a character - temp <- stringr::str_extract_all( + temp <- str_extract_all( col, pattern = pattern, simplify = TRUE) # Collapse matrix to strings temp <- apply(temp, 1, paste, collapse = "") @@ -403,7 +403,7 @@ setMethod( # Check if assay contains integers or floats. biom constructor # requires that information since the default value is "int". mat_type <- ifelse(all(assay %% 1 == 0), "int", "float") - + # Create argument list args <- list(data = assay, matrix_element_type = mat_type) # Add rowData and colData only if they contain information diff --git a/R/convertFromDADA2.R b/R/convertFromDADA2.R index e2de656d8..a1da10198 100644 --- a/R/convertFromDADA2.R +++ b/R/convertFromDADA2.R @@ -1,20 +1,20 @@ #' Create a \code{TreeSummarizedExperiment} object from \sQuote{DADA2} results #' -#' @param ... Additional arguments. For \code{convertFromDADA2}, see +#' @param ... Additional arguments. For \code{convertFromDADA2}, see #' \code{mergePairs} function for more details. #' #' @details -#' +#' #' \code{convertFromDADA2} is a wrapper for the #' \code{mergePairs} function from the \code{dada2} package. -#' A count matrix is constructed via -#' \code{makeSequenceTable(mergePairs(...))} and rownames are dynamically -#' created as \code{ASV(N)} with \code{N} from 1 to \code{nrow} of the count +#' A count matrix is constructed via +#' \code{makeSequenceTable(mergePairs(...))} and rownames are dynamically +#' created as \code{ASV(N)} with \code{N} from 1 to \code{nrow} of the count #' tables. The colnames and rownames from the output of \code{makeSequenceTable} -#' are stored as \code{colnames} and in the \code{referenceSeq} slot of the +#' are stored as \code{colnames} and in the \code{referenceSeq} slot of the #' \code{TreeSummarizedExperiment}, respectively. #' -#' @return \code{convertFromDADA2} returns an object of class +#' @return \code{convertFromDADA2} returns an object of class #' \code{\link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment}} #' #' @importFrom S4Vectors SimpleList @@ -25,7 +25,7 @@ #' @export #' #' @examples -#' +#' #' ### Coerce DADA2 results to a TreeSE object #' if(requireNamespace("dada2")) { #' fnF <- system.file("extdata", "sam1F.fastq.gz", package="dada2") @@ -36,19 +36,17 @@ #' tse <- convertFromDADA2(dadaF, fnF, dadaR, fnR) #' tse #' } +#' @importFrom stringr str_pad convertFromDADA2 <- function(...) { # input checks .require_package("dada2") - .require_package("stringr") # mergers <- dada2::mergePairs(...) seqtab <- dada2::makeSequenceTable(mergers) seqtab <- t(seqtab) # generate row and col names rName <- paste0("ASV", - stringr::str_pad(seq.int(1L,nrow(seqtab)), - nchar(nrow(seqtab)) + 1L, - pad="0")) + str_pad(seq.int(1L,nrow(seqtab)), nchar(nrow(seqtab)) + 1L, pad="0")) cName <- colnames(seqtab) # retrieve count data and reference sequence assays <- S4Vectors::SimpleList(counts = unname(seqtab)) diff --git a/R/mediate.R b/R/mediate.R index daaef2d29..8ebf4a687 100644 --- a/R/mediate.R +++ b/R/mediate.R @@ -3,13 +3,13 @@ #' \code{getMediation} and \code{addMediation} provide a wrapper of #' \code{\link[mediation:mediate]{mediate}} for #' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}. -#' +#' #' @param x a #' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}}. -#' +#' #' @param outcome \code{Character scalar}. Indicates the colData variable used #' as outcome in the model. -#' +#' #' @param treatment \code{Character scalar}. Indicates the colData variable #' used as treatment in the model. #' @@ -18,34 +18,37 @@ #' #' @param assay.type \code{Character scalar}. Specifies the assay used for #' feature-wise mediation analysis. (Default: \code{NULL}) -#' +#' #' @param dimred \code{Character scalar}. Indicates the reduced dimension #' result in \code{reducedDims(object)} for component-wise mediation analysis. #' (Default: \code{NULL}) #' #' @param family \code{Character scalar}. A specification for the outcome model #' link function. (Default: \code{gaussian("identity")}) -#' +#' #' @param covariates \code{Character scalar} or \code{character vector}. #' Indicates the colData variables used as covariates in the model. #' (Default: \code{NULL}) -#' +#' #' @param p.adj.method \code{Character scalar}. Selects adjustment method #' of p-values. Passed to `p.adjust` function. #' (Default: \code{"holm"}) -#' +#' #' @param add.metadata \code{Logical scalar}. Should the model metadata be -#' returned. (Default: \code{FALSE}) -#' +#' returned. (Default: \code{TRUE}) +#' +#' @param sort \code{Logical scalar}. Should the results be sorted by decreasing +#' significance in terms of ACME_pval. (Default: \code{FALSE}) +#' #' @param verbose \code{Logical scalar}. Should execution messages be printed. #' (Default: \code{TRUE}) -#' -#' @param name \code{Character scalar}. A name for the column of the +#' +#' @param name \code{Character scalar}. A name for the column of the #' \code{colData} where results will be stored. (Default: \code{"mediation"}) -#' +#' #' @param ... additional parameters that can be passed to #' \code{\link[mediation:mediate]{mediate}}. -#' +#' #' @details #' This wrapper of \code{\link[mediation:mediate]{mediate}} for #' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} @@ -54,97 +57,115 @@ #' another variable in colData (argument \code{mediator}) or an assay #' (argument \code{assay.type}) or a reducedDim (argument \code{dimred}). #' Importantly, those three arguments are mutually exclusive. -#' -#' @return +#' +#' @return #' \code{getMediation} returns a \code{data.frame} of adjusted p-values and #' effect sizes for the ACMEs and ADEs of every mediator given as input, whereas #' \code{addMediation} returns an updated #' \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} #' instance with the same \code{data.frame} stored in the metadata as -#' "mediation". Its columns include: -#' +#' "mediation" or as specified in the \code{name} argument. Its columns include: +#' #' \describe{ -#' \item{Mediator}{the mediator variable} -#' \item{ACME_estimate}{the Average Causal Mediation Effect (ACME) estimate} -#' \item{ADE_estimate}{the Average Direct Effect (ADE) estimate} -#' \item{ACME_pval}{the adjusted p-value for the ACME estimate} -#' \item{ADE_pval}{the adjusted p-value for the ADE estimate} +#' \item{mediator}{the mediator variable} +#' \item{acme}{the Average Causal Mediation Effect (ACME) estimate} +#' \item{acme_pval}{the original p-value for the ACME estimate} +#' \item{acme_lower}{the lower bound of the CI for the ACME estimate} +#' \item{acme_upper}{the upper bound of the CI for the ACME estimate} +#' \item{ade}{the Average Direct Effect (ADE) estimate} +#' \item{ade_pval}{the original p-value for the ADE estimate} +#' \item{ade_lower}{the lower bound of the CI for the ADE estimate} +#' \item{ade_upper}{the upper bound of the CI for the ADE estimate} +#' \item{total}{the Total Effect estimate} +#' \item{total_lower}{the lower bound of the CI for the Total Effect estimate} +#' \item{total_upper}{the upper bound of the CI for the Total Effect estimate} +#' \item{total_pval}{the original p-value for the Total Effect estimate} +#' \item{acme_padj}{the adjusted p-value for the ACME estimate} +#' \item{ade_padj}{the adjusted p-value for the ADE estimate} +#' \item{total_padj}{the adjusted p-value for the Total Effect estimate} #' } #' +#' The original output of \code{\link[mediation:mediate]{mediate}} for each +#' analysed mediator is stored as the "model_metadata" attribute of the +#' resulting \code{data.frame} and can be accessed via the \code{attr} function. +#' #' @name getMediation #' #' @examples #' \dontrun{ #' # Import libraries #' library(mia) +#' library(miaViz) #' library(scater) -#' +#' #' # Load dataset #' data(hitchip1006, package = "miaTime") #' tse <- hitchip1006 -#' +#' #' # Agglomerate features by family (merely to speed up execution) #' tse <- agglomerateByRank(tse, rank = "Phylum") #' # Convert BMI variable to numeric #' tse$bmi_group <- as.numeric(tse$bmi_group) -#' +#' #' # Analyse mediated effect of nationality on BMI via alpha diversity #' # 100 permutations were done to speed up execution, but ~1000 are recommended -#' med_df <- getMediation(tse, -#' outcome = "bmi_group", -#' treatment = "nationality", -#' mediator = "diversity", -#' covariates = c("sex", "age"), -#' treat.value = "Scandinavia", -#' control.value = "CentralEurope", -#' boot = TRUE, sims = 100, -#' add.metadata = TRUE) -#' -#' # Visualise model statistics for 1st mediator -#' plot(attr(med_df, "metadata")[[1]]) -#' +#' med_df <- getMediation( +#' tse, +#' outcome = "bmi_group", +#' treatment = "nationality", +#' mediator = "diversity", +#' covariates = c("sex", "age"), +#' treat.value = "Scandinavia", +#' control.value = "CentralEurope", +#' boot = TRUE, sims = 100) +#' +#' # Visualise model statistics +#' plotMediation(med_df) +#' #' # Apply clr transformation to counts assay -#' tse <- transformAssay(tse, -#' method = "clr", -#' pseudocount = 1) +#' tse <- transformAssay(tse, method = "clr", pseudocount = 1) #' #' # Analyse mediated effect of nationality on BMI via clr-transformed features #' # 100 permutations were done to speed up execution, but ~1000 are recommended -#' tse <- addMediation(tse, name = "assay_mediation", -#' outcome = "bmi_group", -#' treatment = "nationality", -#' assay.type = "clr", -#' covariates = c("sex", "age"), -#' treat.value = "Scandinavia", -#' control.value = "CentralEurope", -#' boot = TRUE, sims = 100, -#' p.adj.method = "fdr") -#' +#' tse <- addMediation( +#' tse, name = "assay_mediation", +#' outcome = "bmi_group", +#' treatment = "nationality", +#' assay.type = "clr", +#' covariates = c("sex", "age"), +#' treat.value = "Scandinavia", +#' control.value = "CentralEurope", +#' boot = TRUE, sims = 100, +#' p.adj.method = "fdr") +#' #' # Show results for first 5 mediators #' head(metadata(tse)$assay_mediation, 5) -#' +#' #' # Perform ordination -#' tse <- runMDS(tse, name = "MDS", -#' method = "euclidean", -#' assay.type = "clr", -#' ncomponents = 3) -#' +#' tse <- runMDS( +#' tse, name = "MDS", method = "euclidean", +#' assay.type = "clr", ncomponents = 3) +#' #' # Analyse mediated effect of nationality on BMI via NMDS components #' # 100 permutations were done to speed up execution, but ~1000 are recommended -#' tse <- addMediation(tse, name = "reddim_mediation", -#' outcome = "bmi_group", -#' treatment = "nationality", -#' dimred = "MDS", -#' covariates = c("sex", "age"), -#' treat.value = "Scandinavia", -#' control.value = "CentralEurope", -#' boot = TRUE, sims = 100, -#' p.adj.method = "fdr") -#' +#' tse <- addMediation( +#' tse, name = "reddim_mediation", +#' outcome = "bmi_group", +#' treatment = "nationality", +#' dimred = "MDS", +#' covariates = c("sex", "age"), +#' treat.value = "Scandinavia", +#' control.value = "CentralEurope", +#' boot = TRUE, sims = 100, +#' p.adj.method = "fdr") +#' #' # Show results for first 5 mediators #' head(metadata(tse)$reddim_mediation, 5) +#' +#' # Access model metadata +#' attr(metadata(tse)$reddim_mediation, "model_metadata") #' } -#' +#' NULL #' @rdname getMediation @@ -154,15 +175,15 @@ setMethod("addMediation", signature = c(x = "SummarizedExperiment"), function(x, outcome, treatment, name = "mediation", mediator = NULL, assay.type = NULL, dimred = NULL, family = gaussian(), covariates = NULL, p.adj.method = "holm", - add.metadata = FALSE, verbose = TRUE, ...) { - + add.metadata = TRUE, verbose = TRUE, ...) { + med_df <- getMediation( x, outcome, treatment, mediator, assay.type, dimred, family, covariates, p.adj.method, add.metadata, verbose, ... ) - + x <- .add_values_to_metadata(x, name, med_df) return(x) } @@ -176,7 +197,7 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), function(x, outcome, treatment, mediator = NULL, assay.type = NULL, dimred = NULL, family = gaussian(), covariates = NULL, p.adj.method = "holm", - add.metadata = FALSE, verbose = TRUE, ...) { + add.metadata = TRUE, sort = FALSE, verbose = TRUE, ...) { ###################### Input check ######################## if( !outcome %in% names(colData(x)) ){ stop(outcome, " not found in colData(x).", call. = FALSE) @@ -193,18 +214,18 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), if( !.is_a_bool(verbose) ){ stop("verbose must be TRUE or FALSE.", call. = FALSE) } - + # Check that arguments can be passed to mediate and filter samples - x <- .check.mediate.args( + x <- .check_mediate_args( x, outcome, treatment, mediator, covariates, verbose, ... ) - + # Check which mediator was provided (colData, assay or reducedDim) med_opts <- unlist(lapply( list(mediator = mediator, assay.type = assay.type, dimred = dimred), function(x) !is.null(x) )) - + if( sum(med_opts) != 1 ){ # Throw error if none or multiple mediation options are specified stop( @@ -213,13 +234,13 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), call. = FALSE ) } - + if ( med_opts[[1]] ){ # Check that mediator is in colData if( !mediator %in% names(colData(x)) ) { stop(mediator, " not found in colData(x).", call. = FALSE) } - # Use mediator for analysis + # Use mediator for analysis mediators <- mediator mat <- NULL } else if( med_opts[[2]] ){ @@ -241,17 +262,14 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), # Use reducedDim for analysis mediators <- rownames(mat) } - - # Create template list of results - results <- list( - Mediator = c(), ACME_estimate = c(), ADE_estimate = c(), - ACME_pval = c(), ADE_pval = c(), Model = list() - ) - - # Set initial index + + # Create template list of models + models <- list() + # Set initial index i <- 0 + for( mediator in mediators ){ - # Update index + # Update index i <- i + 1 if( verbose ){ message("\rMediator ", i, " out of ", @@ -264,43 +282,47 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), family = family, mat = mat, covariates = covariates, ... ) - # Update list of results - results <- .update.results(results, med_out, mediator) + # Update list of models + models <- c(models, list(med_out)) } + + # Name models by mediators + names(models) <- mediators # Combine results into dataframe - med_df <- .make.output(results, p.adj.method, add.metadata) + med_df <- .make_output(models, p.adj.method, add.metadata, sort) + return(med_df) } ) # Check that arguments can be passed to mediate and remove unused samples #' @importFrom stats na.omit -.check.mediate.args <- function( +.check_mediate_args <- function( x, outcome, treatment, mediator, covariates, verbose = TRUE, ...) { - + # Create dataframe from selected columns of colData df <- as.data.frame(colData(x)[ , names(colData(x)) %in% c( outcome, treatment, mediator, covariates)]) # Store kwargs into variable kwargs <- list(...) - + # Remove missing data from df df <- na.omit(df) diff <- ncol(x) - nrow(df) - + if( diff != 0 ){ # Remove missing data from se x <- x[ , rownames(df)] - + if( verbose ){ message(diff, " samples removed because of missing data.") } } - + # If boot is TRUE and treatment variable is discrete and has 3+ levels if( !is.null(kwargs[["boot"]]) && !is.numeric(df[[treatment]]) && length(unique((df[[treatment]]))) > 2 ) { - + ## if control and treat value are not specified if( any(vapply(kwargs[c("control.value", "treat.value")], is.null, logical(1))) ){ @@ -318,17 +340,17 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), "the treatment variable.", call. = FALSE ) } - + # Find indices of samples that belong to either control or treatment keep <- df[[treatment]] %in% kwargs[c("control.value", "treat.value")] - + # Remove samples different from control and treatment from df df <- df[keep, ] diff <- ncol(x) - nrow(df) - + # Remove samples different from control and treatment from se x <- x[ , rownames(df)] - + if( verbose ){ message( diff, " samples removed because different ", @@ -340,14 +362,14 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), } # Run mediation analysis -#' @importFrom mediation mediate #' @importFrom stats lm formula glm .run_mediate <- function(x, outcome, treatment, mediator = NULL, mat = NULL, family = gaussian(), covariates = NULL, ...) { + .require_package("mediation") # Create initial dataframe with outcome and treatment variables df <- data.frame( Outcome = colData(x)[[outcome]], Treatment = colData(x)[[treatment]]) - + if( is.null(mat) ){ # If matrix not given, fetch mediator from colData df[["Mediator"]] <- colData(x)[[mediator]] @@ -355,12 +377,12 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), # If matrix given, use it as mediators df[["Mediator"]] <- mat[mediator, ] } - + # Define basic formula mediation model relation_m <- "Mediator ~ Treatment" # Define basic formula outcome model relation_dv <- "Outcome ~ Treatment + Mediator" - + if( !is.null(covariates) ){ # Fetch covariates from colData and store them in dataframe df <- cbind(df, colData(x)[covariates]) @@ -384,53 +406,59 @@ setMethod("getMediation", signature = c(x = "SummarizedExperiment"), list(formula = formula(relation_dv), family = family, data = df) ) # Run mediation analysis - med_out <- mediate( + med_out <- mediation::mediate( fit_m, fit_dv, treat = "Treatment", mediator = "Mediator", covariates = covariates, ... ) - return(med_out) -} - -# Update list of results -.update.results <- function(results, med_out, mediator) { - # Update model variables - results[["Mediator"]] <- c(results[["Mediator"]], mediator) - # Update stats of ACME (average causal mediation effect) - results[["ACME_estimate"]] <- c(results[["ACME_estimate"]], med_out$d.avg) - results[["ACME_pval"]] <- c(results[["ACME_pval"]], med_out$d.avg.p) - # Update stats of ADE (average direct effect) - results[["ADE_estimate"]] <- c(results[["ADE_estimate"]], med_out$z.avg) - results[["ADE_pval"]] <- c(results[["ADE_pval"]], med_out$z.avg.p) - # Add current model to metadata - results[["Model"]][[length(results[["Model"]]) + 1]] <- med_out - return(results) + return(med_out) } # Combine results into output dataframe -.make.output <- function(results, p.adj.method, add.metadata) { - - # Create dataframe with model variables, effect sizes and p-values - med_df <- do.call(data.frame, results[seq_len(length(results) - 1)]) - +#' @importFrom dplyr select +#' @importFrom tidyr starts_with ends_with unnest_wider +#' @importFrom stringr str_extract str_replace_all +#' @importFrom stats p.adjust +.make_output <- function(models, p.adj.method, add.metadata, sort) { + # Combine results + res <- do.call(rbind, models) |> as.data.frame() + res[["mediator"]] <- names(models) + # Select certain data types + res <- res |> + select(mediator, starts_with(c("d.avg", "z.avg", "tau"))) |> + select(-ends_with(c("sims"))) + # Get columns with scalar values and turn them into vector instead of list + cols <- vapply(res, function(col) all(lengths(col) == 1L), logical(1L)) + res[, cols] <- lapply(res[, cols], unlist) + # Unnest confidence interval columns + res <- res |> unnest_wider(ends_with(".ci"), names_sep = "_") + # Replace confidence interval values with "lower" and "upper" + limits <- str_extract(colnames(res), "([0-9.]+)%") |> unique() + limits <- limits[!is.na(limits)] + limits <- limits[order(as.numeric(gsub("%", "", limits)))] + lookup <- c("lower", "upper") + names(lookup) <- limits + colnames(res) <- str_replace_all(colnames(res), lookup) + # Tidy other names also + lookup <- c("d.avg" = "acme", "z.avg" = "ade", "tau" = "total", + "\\.p" = "_pval", "\\.ci_" = "_", "\\.coef" = "") + colnames(res) <- str_replace_all(colnames(res), lookup) + # Compute adjusted p-values and add them to dataframe - med_df[["ACME_pval"]] <- p.adjust( - med_df[["ACME_pval"]], - method = p.adj.method - ) - med_df[["ADE_pval"]] <- p.adjust( - med_df[["ADE_pval"]], - method = p.adj.method - ) - + cols <- endsWith(colnames(res), "pval") + pcols <- lapply(res[ , cols], p.adjust, method = p.adj.method) + pcols <- do.call(cbind, pcols) + colnames(pcols) <- gsub("pval", "padj", colnames(pcols)) + res <- cbind(res, pcols) + + # Store model for each mediator into the model_metadata attribute if( add.metadata ){ - # models for every mediator are saved into the metadata attribute - attr(med_df, "metadata") <- results[["Model"]] + attr(res, "model_metadata") <- models } - # Order output dataframe by ACME p-values - med_df <- med_df[order(med_df[["ACME_pval"]]), ] - - return(med_df) + if( sort ){ + res <- res[order(res[["acme_padj"]]), ] + } + return(res) } diff --git a/man/getMediation.Rd b/man/getMediation.Rd index ad2ff1269..2f69d7d3f 100644 --- a/man/getMediation.Rd +++ b/man/getMediation.Rd @@ -22,7 +22,7 @@ getMediation(x, ...) family = gaussian(), covariates = NULL, p.adj.method = "holm", - add.metadata = FALSE, + add.metadata = TRUE, verbose = TRUE, ... ) @@ -37,7 +37,8 @@ getMediation(x, ...) family = gaussian(), covariates = NULL, p.adj.method = "holm", - add.metadata = FALSE, + add.metadata = TRUE, + sort = FALSE, verbose = TRUE, ... ) @@ -80,10 +81,13 @@ of p-values. Passed to \code{p.adjust} function. (Default: \code{"holm"})} \item{add.metadata}{\code{Logical scalar}. Should the model metadata be -returned. (Default: \code{FALSE})} +returned. (Default: \code{TRUE})} \item{verbose}{\code{Logical scalar}. Should execution messages be printed. (Default: \code{TRUE})} + +\item{sort}{\code{Logical scalar}. Should the results be sorted by decreasing +significance in terms of ACME_pval. (Default: \code{FALSE})} } \value{ \code{getMediation} returns a \code{data.frame} of adjusted p-values and @@ -91,15 +95,30 @@ effect sizes for the ACMEs and ADEs of every mediator given as input, whereas \code{addMediation} returns an updated \code{\link[SummarizedExperiment:SummarizedExperiment-class]{SummarizedExperiment}} instance with the same \code{data.frame} stored in the metadata as -"mediation". Its columns include: +"mediation" or as specified in the \code{name} argument. Its columns include: \describe{ -\item{Mediator}{the mediator variable} -\item{ACME_estimate}{the Average Causal Mediation Effect (ACME) estimate} -\item{ADE_estimate}{the Average Direct Effect (ADE) estimate} -\item{ACME_pval}{the adjusted p-value for the ACME estimate} -\item{ADE_pval}{the adjusted p-value for the ADE estimate} +\item{mediator}{the mediator variable} +\item{acme}{the Average Causal Mediation Effect (ACME) estimate} +\item{acme_pval}{the original p-value for the ACME estimate} +\item{acme_lower}{the lower bound of the CI for the ACME estimate} +\item{acme_upper}{the upper bound of the CI for the ACME estimate} +\item{ade}{the Average Direct Effect (ADE) estimate} +\item{ade_pval}{the original p-value for the ADE estimate} +\item{ade_lower}{the lower bound of the CI for the ADE estimate} +\item{ade_upper}{the upper bound of the CI for the ADE estimate} +\item{total}{the Total Effect estimate} +\item{total_lower}{the lower bound of the CI for the Total Effect estimate} +\item{total_upper}{the upper bound of the CI for the Total Effect estimate} +\item{total_pval}{the original p-value for the Total Effect estimate} +\item{acme_padj}{the adjusted p-value for the ACME estimate} +\item{ade_padj}{the adjusted p-value for the ADE estimate} +\item{total_padj}{the adjusted p-value for the Total Effect estimate} } + +The original output of \code{\link[mediation:mediate]{mediate}} for each +analysed mediator is stored as the "model_metadata" attribute of the +resulting \code{data.frame} and can be accessed via the \code{attr} function. } \description{ \code{getMediation} and \code{addMediation} provide a wrapper of @@ -119,12 +138,13 @@ Importantly, those three arguments are mutually exclusive. \dontrun{ # Import libraries library(mia) +library(miaViz) library(scater) # Load dataset data(hitchip1006, package = "miaTime") tse <- hitchip1006 - + # Agglomerate features by family (merely to speed up execution) tse <- agglomerateByRank(tse, rank = "Phylum") # Convert BMI variable to numeric @@ -132,59 +152,61 @@ tse$bmi_group <- as.numeric(tse$bmi_group) # Analyse mediated effect of nationality on BMI via alpha diversity # 100 permutations were done to speed up execution, but ~1000 are recommended -med_df <- getMediation(tse, - outcome = "bmi_group", - treatment = "nationality", - mediator = "diversity", - covariates = c("sex", "age"), - treat.value = "Scandinavia", - control.value = "CentralEurope", - boot = TRUE, sims = 100, - add.metadata = TRUE) - - # Visualise model statistics for 1st mediator - plot(attr(med_df, "metadata")[[1]]) +med_df <- getMediation( + tse, + outcome = "bmi_group", + treatment = "nationality", + mediator = "diversity", + covariates = c("sex", "age"), + treat.value = "Scandinavia", + control.value = "CentralEurope", + boot = TRUE, sims = 100) + +# Visualise model statistics +plotMediation(med_df) # Apply clr transformation to counts assay -tse <- transformAssay(tse, - method = "clr", - pseudocount = 1) +tse <- transformAssay(tse, method = "clr", pseudocount = 1) # Analyse mediated effect of nationality on BMI via clr-transformed features # 100 permutations were done to speed up execution, but ~1000 are recommended -tse <- addMediation(tse, name = "assay_mediation", - outcome = "bmi_group", - treatment = "nationality", - assay.type = "clr", - covariates = c("sex", "age"), - treat.value = "Scandinavia", - control.value = "CentralEurope", - boot = TRUE, sims = 100, - p.adj.method = "fdr") +tse <- addMediation( + tse, name = "assay_mediation", + outcome = "bmi_group", + treatment = "nationality", + assay.type = "clr", + covariates = c("sex", "age"), + treat.value = "Scandinavia", + control.value = "CentralEurope", + boot = TRUE, sims = 100, + p.adj.method = "fdr") # Show results for first 5 mediators head(metadata(tse)$assay_mediation, 5) - + # Perform ordination -tse <- runMDS(tse, name = "MDS", - method = "euclidean", - assay.type = "clr", - ncomponents = 3) +tse <- runMDS( + tse, name = "MDS", method = "euclidean", + assay.type = "clr", ncomponents = 3) # Analyse mediated effect of nationality on BMI via NMDS components # 100 permutations were done to speed up execution, but ~1000 are recommended -tse <- addMediation(tse, name = "reddim_mediation", - outcome = "bmi_group", - treatment = "nationality", - dimred = "MDS", - covariates = c("sex", "age"), - treat.value = "Scandinavia", - control.value = "CentralEurope", - boot = TRUE, sims = 100, - p.adj.method = "fdr") +tse <- addMediation( + tse, name = "reddim_mediation", + outcome = "bmi_group", + treatment = "nationality", + dimred = "MDS", + covariates = c("sex", "age"), + treat.value = "Scandinavia", + control.value = "CentralEurope", + boot = TRUE, sims = 100, + p.adj.method = "fdr") # Show results for first 5 mediators head(metadata(tse)$reddim_mediation, 5) + +# Access model metadata +attr(metadata(tse)$reddim_mediation, "model_metadata") } } diff --git a/man/importBIOM.Rd b/man/importBIOM.Rd index ef2cf2bc4..563768409 100644 --- a/man/importBIOM.Rd +++ b/man/importBIOM.Rd @@ -116,7 +116,7 @@ biom_file <- system.file( # Clean artifacts from taxonomic data tse <- importBIOM(biom_file, artifact.rm = TRUE) - + } \seealso{ \code{\link[=importQIIME2]{importQIIME2}} diff --git a/man/splitOn.Rd b/man/splitOn.Rd index 7b7437aab..4f580e176 100644 --- a/man/splitOn.Rd +++ b/man/splitOn.Rd @@ -105,10 +105,10 @@ group. \examples{ data(GlobalPatterns) tse <- GlobalPatterns -# Split data based on SampleType. +# Split data based on SampleType. se_list <- splitOn(tse, group = "SampleType") -# List of SE objects is returned. +# List of SE objects is returned. se_list # Create arbitrary groups diff --git a/tests/testthat/test-mediate.R b/tests/testthat/test-mediate.R index 4ce89c923..6962b6a18 100644 --- a/tests/testthat/test-mediate.R +++ b/tests/testthat/test-mediate.R @@ -1,86 +1,89 @@ test_that("getMediation", { - + skip_if_not(require("miaTime", quietly = TRUE)) data("hitchip1006", package = "miaTime") tse <- hitchip1006 - + tse <- agglomerateByRank(tse, rank = "Phylum") tse$bmi_group <- as.numeric(tse$bmi_group) - + ### Batch 1: check errors when missing or wrong arguments ### expect_error( getMediation(tse, outcome = "bmi_group", treatment = "nationality", mediator = "diversity", assay.type = "counts"), "The arguments mediator, assay.type and dimred are mutually exclusive, but 2 were provided." ) - + expect_error( getMediation(tse, outcome = "wrong_name", treatment = "nationality", mediator = "diversity"), "wrong_name not found in colData(x).", fixed = TRUE ) - + expect_error( getMediation(tse, outcome = "bmi_group", treatment = "wrong_name", mediator = "diversity"), "wrong_name not found in colData(x).", fixed = TRUE ) - + expect_error( getMediation(tse, outcome = "bmi_group", treatment = "nationality", mediator = "wrong_name"), "wrong_name not found in colData(x).", fixed = TRUE ) - + expect_error( getMediation(tse, outcome = "bmi_group", treatment = "nationality", assay.type = "wrong_name"), - "wrong_name not found in assays(x).", fixed = TRUE + "'assay.type' must be a valid name of assays(x)", fixed = TRUE ) - + expect_error( getMediation(tse, outcome = "bmi_group", treatment = "nationality", dimred = "wrong_name"), "wrong_name not found in reducedDims(x).", fixed = TRUE ) - + expect_error( getMediation(tse, outcome = "bmi_group", treatment = "nationality", mediator = "diversity", boot = TRUE, sims = 1), "Too many treatment levels. Consider specifing a treat.value and a control.value" ) - + expect_error( getMediation(tse, outcome = "bmi_group", treatment = "nationality", mediator = "diversity", boot = TRUE, sims = 1, treat.value = "wrong_value", control.value = "wrong_value"), "treat.value and/or control.value not found in the levels of the treatment variable." ) - + + skip_if_not_installed("mediation") ### Batch 2: check equality between base function and wrapper ### set.seed(123) med_df <- getMediation(tse, outcome = "bmi_group", treatment = "nationality", mediator = "diversity", treat.value = "Scandinavia", control.value = "CentralEurope", boot = TRUE, sims = 1, add.metadata = TRUE) - + df <- data.frame(Outcome = tse$bmi_group, Treatment = tse$nationality, Mediator = tse$diversity) df <- na.omit(df) df <- df[df$Treatment %in% c("Scandinavia", "CentralEurope"), ] fit_m <- lm(Mediator ~ Treatment, data = df) fit_dv <- glm(Outcome ~ Treatment + Mediator, data = df) - + set.seed(123) med_out <- mediate(fit_m, fit_dv, treat = "Treatment", mediator = "Mediator", treat.value = "Scandinavia", control.value = "CentralEurope", boot = TRUE, sims = 1) - - expect_equal(attr(med_df, "metadata")[[1]]$d.avg, med_out$d.avg) - expect_equal(attr(med_df, "metadata")[[1]]$d.avg.p, med_out$d.avg.p) - expect_equal(attr(med_df, "metadata")[[1]]$z.avg, med_out$z.avg) - expect_equal(attr(med_df, "metadata")[[1]]$z.avg.p, med_out$z.avg.p) - + + expect_equal(attr(med_df, "model_metadata")[[1]][["d.avg"]], med_out$d.avg) + expect_equal(attr(med_df, "model_metadata")[[1]][["d.avg.p"]], med_out$d.avg.p) + expect_equal(attr(med_df, "model_metadata")[[1]][["z.avg"]], med_out$z.avg) + expect_equal(attr(med_df, "model_metadata")[[1]][["z.avg.p"]], med_out$z.avg.p) + ### Batch 3: check output format and dimensionality with respect to SE ### med_df <- getMediation(tse, outcome = "bmi_group", treatment = "nationality", assay.type = "counts", treat.value = "Scandinavia", control.value = "CentralEurope", boot = TRUE, sims = 1) - - expect_named(tse, med_df[["Mediator"]]) - - expect_named(med_df, c("Mediator", "ACME_estimate", "ADE_estimate", "ACME_pval", "ADE_pval")) + expect_named(tse, med_df[["mediator"]]) + + expect_named(med_df, c("mediator", "acme", "acme_pval", "acme_lower", + "acme_upper", "ade", "ade_pval", "ade_lower", "ade_upper", "total", + "total_lower", "total_upper", "total_pval", "acme_padj", "ade_padj", + "total_padj")) })