diff --git a/DESCRIPTION b/DESCRIPTION index 16ded79..254d0cf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: scLANE Type: Package Title: Model Gene Expression Dynamics with Spline-Based NB GLMs, GEEs, & GLMMs -Version: 0.8.6 +Version: 0.8.7 Authors@R: c(person(given = c("Jack", "R."), family = "Leary", email = "j.leary@ufl.edu", role = c("aut", "cre"), comment = c(ORCID = "0009-0004-8821-3269")), person(given = "Rhonda", family = "Bacher", email = "rbacher@ufl.edu", role = c("ctb", "fnd"), comment = c(ORCID = "0000-0001-5787-476X"))) Description: Our scLANE model uses truncated power basis spline models to build flexible, interpretable models of single cell gene expression over pseudotime or latent time. diff --git a/NAMESPACE b/NAMESPACE index 91f24bd..a3ad165 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -136,6 +136,7 @@ importFrom(stats,fitted.values) importFrom(stats,hclust) importFrom(stats,kmeans) importFrom(stats,logLik) +importFrom(stats,model.matrix) importFrom(stats,offset) importFrom(stats,p.adjust) importFrom(stats,p.adjust.methods) diff --git a/NEWS.md b/NEWS.md index fadef48..a054195 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# Changes in v0.8.7 + ++ Switched GEE fitting back to use `scale.fix = FALSE` and a fixed value for the Negative-binomial overdispersion parameter as it improves model fits. ++ Added option to use a Lagrange Multiplier (Score) test for GEE mode instead of the default Wald test. The relevant argument is `gee.test` in `testDynamic()`. ++ Updated documentation and some tests. ++ Added column called `Null_Fit_Notes` to output from `getResultsDE()` to describe when and how null models fail. This doesn't happen frequently, but it's good info to have when it does. + # Changes in v0.8.6 + Changed GEE fitting to use `scale.fix = TRUE` throughout the package, as it appears to be faster and more statistically efficient based on simulated data benchmarking. diff --git a/R/backward_sel_WIC.R b/R/backward_sel_WIC.R index 975051b..4ee5da7 100644 --- a/R/backward_sel_WIC.R +++ b/R/backward_sel_WIC.R @@ -7,7 +7,7 @@ #' @importFrom gamlss gamlss #' @importFrom geeM geem #' @importFrom MASS negative.binomial -#' @importFrom stats vcov +#' @importFrom stats vcov coef #' @param Y The response variable. Defaults to NULL. #' @param B_new The model matrix. Defaults to NULL. #' @param is.gee Is the model a GEE? Defaults to FALSE. @@ -36,8 +36,8 @@ backward_sel_WIC <- function(Y = NULL, fit <- geeM::geem(Y ~ B_new - 1, id = id.vec, corstr = cor.structure, - family = MASS::negative.binomial(theta.hat, link = "log"), - scale.fix = TRUE, + family = MASS::negative.binomial(50, link = "log"), + scale.fix = FALSE, sandwich = sandwich.var) wald_stat <- unname(summary(fit)$wald.test[-1])^2 } else { @@ -47,7 +47,7 @@ backward_sel_WIC <- function(Y = NULL, vcov_mat <- try({ stats::vcov(fit, type = "all") }, silent = TRUE) if (inherits(vcov_mat, "try-error")) { covmat_unscaled <- chol2inv(fit$mu.qr$qr[1:(fit$mu.df - fit$mu.nl.df), 1:(fit$mu.df - fit$mu.nl.df), drop = FALSE]) - wald_stat <- unname(coef(fit) / sqrt(diag(covmat_unscaled))^2)[-1] + wald_stat <- unname(stats::coef(fit) / sqrt(diag(covmat_unscaled))^2)[-1] } else { wald_stat <- unname((vcov_mat$coef / vcov_mat$se)[-c(1, length(vcov_mat$coef))]^2) } diff --git a/R/getResultsDE.R b/R/getResultsDE.R index a366436..0af82ba 100644 --- a/R/getResultsDE.R +++ b/R/getResultsDE.R @@ -27,7 +27,7 @@ getResultsDE <- function(test.dyn.res = NULL, fdr.cutoff = 0.01, n.cores = 2L) { # check inputs - if (is.null(test.dyn.res)) { stop("Please provide a result list.") } + if (is.null(test.dyn.res)) { stop("Please provide a result list from testDynamic().") } if (!p.adj.method %in% stats::p.adjust.methods) { stop("Please choose a valid p-value adjustment method.") } # set up parallel processing if (n.cores > 1L) { @@ -40,7 +40,7 @@ getResultsDE <- function(test.dyn.res = NULL, function(x) { purrr::map_dfr(x, function(y) { - as.data.frame(rbind(y[seq(13)])) %>% + as.data.frame(rbind(y[seq(14)])) %>% dplyr::mutate(dplyr::across(tidyselect::everything(), \(z) unname(unlist(z)))) }) }) %>% diff --git a/R/marge2.R b/R/marge2.R index f2ff978..791cc67 100644 --- a/R/marge2.R +++ b/R/marge2.R @@ -24,7 +24,7 @@ #' @param cor.structure If \code{is.gee = TRUE}, a string specifying the desired correlation structure for the NB GEE. Defaults to "ar1". #' @param sandwich.var (Optional) Should the sandwich variance estimator be used instead of the model-based estimator? Default to FALSE. #' @param approx.knot (Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE. -#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 50. +#' @param n.knot.max (Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 25. #' @param glm.backend (Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS". #' @param tols_score (Optional) The set tolerance for monitoring the convergence for the difference in score statistics between the parent and candidate model (this is the lack-of-fit criterion used for MARGE). Defaults to 0.00001. #' @param minspan (Optional) A set minimum span value. Defaults to NULL. @@ -63,7 +63,7 @@ marge2 <- function(X_pred = NULL, cor.structure = "ar1", sandwich.var = FALSE, approx.knot = TRUE, - n.knot.max = 50, + n.knot.max = 25, glm.backend = "MASS", tols_score = 1e-5, minspan = NULL, @@ -84,12 +84,6 @@ marge2 <- function(X_pred = NULL, } q <- ncol(X_pred) # Number of predictor variables B <- as.matrix(rep(1, NN)) # Start with the intercept model. - # estimate "known" theta for GEE models - if (is.gee || !is.glmm) { - theta_hat <- MASS::theta.mm(y = Y, - mu = mean(Y), - dfr = NN - 1) - } pen <- 2 # penalty for GCV criterion -- could also switch to log(N) later colnames(B) <- "Intercept" @@ -110,6 +104,7 @@ marge2 <- function(X_pred = NULL, breakFlag <- FALSE ok <- TRUE int.count <- 0 + theta_hat <- 1 while (ok) { # this is such egregiously bad code lol if (breakFlag) { @@ -167,6 +162,7 @@ marge2 <- function(X_pred = NULL, X_red <- seq(min(X_red), max(X_red), length.out = n.knot.max) + X_red <- round(X_red, 4) } } else { # original candidate knot selection from 2017 Stoklosa & Warton paper @@ -638,13 +634,13 @@ marge2 <- function(X_pred = NULL, } else { var_name_list1 <- vector("list") if (trunc.type == 2) { - B_temp <- cbind(B, b1_new, b2_new) # Additive model with both truncated basis functions. + B_temp <- cbind(B, b1_new, b2_new) # Additive model with both truncated basis functions. B_new <- cbind(b1_new, b2_new) B_names <- c(B_name1, B_name2) var_name_list1 <- c(var_name_list1, list(var_name)) var_name_list1 <- c(var_name_list1, list(var_name)) # Repeat it because there are two new truncated basis function in the set. } else { - B_temp <- cbind(B, b1_new) # Additive model with one truncated basis function (i.e., the positive part). + B_temp <- cbind(B, b1_new) # Additive model with one truncated basis function (i.e., the positive part). B_new <- b1_new B_names <- B_name1 var_name_list1 <- c(var_name_list1, list(var_name)) @@ -777,7 +773,7 @@ marge2 <- function(X_pred = NULL, colnames(wic_mat_2)[(ncol_B + 1)] <- "Forward pass model" wic_mat_2[1, (ncol_B + 1)] <- full.wic - wic1_2 <- backward_sel_WIC(Y = Y, B_new = B_new) + wic1_2 <- backward_sel_WIC(Y, B_new = B_new) wic_mat_2[2, 2:(length(wic1_2) + 1)] <- wic1_2 WIC_2 <- sum(apply(wic_mat_2[seq(2), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new) WIC_vec_2 <- c(WIC_vec_2, WIC_2) @@ -787,8 +783,7 @@ marge2 <- function(X_pred = NULL, B_new_2 <- as.matrix(B_new[, -(variable.lowest_2 + 1)]) cnames_2 <- c(cnames_2, list(colnames(B_new_2))) for (i in 2:(ncol_B - 1)) { - wic1_2 <- backward_sel_WIC(Y = Y, - B_new = B_new_2) + wic1_2 <- backward_sel_WIC(Y, B_new = B_new_2) if (i != (ncol_B - 1)) { wic_mat_2[(i + 1), colnames(B_new_2)[-1]] <- wic1_2 WIC_2 <- sum(apply(wic_mat_2[seq(i + 1), ], 1, min, na.rm = TRUE)) + 2 * ncol(B_new_2) @@ -837,9 +832,9 @@ marge2 <- function(X_pred = NULL, final_mod <- geeM::geem(model_formula, data = model_df, id = id.vec, - family = MASS::negative.binomial(theta_hat, link = log), + family = MASS::negative.binomial(50, link = log), corstr = cor.structure, - scale.fix = TRUE, + scale.fix = FALSE, sandwich = sandwich.var) } else { if (glm.backend == "MASS") { @@ -847,14 +842,14 @@ marge2 <- function(X_pred = NULL, data = model_df, method = "glm.fit2", link = log, - init.theta = theta_hat, + init.theta = 1, y = FALSE, model = FALSE) final_mod <- stripGLM(glm.obj = final_mod) } else if (glm.backend == "speedglm") { final_mod <- speedglm::speedglm(model_formula, data = model_df, - family = MASS::negative.binomial(theta_hat, link = "log"), + family = MASS::negative.binomial(50, link = log), trace = FALSE, model = FALSE, y = FALSE, diff --git a/R/plotModelCoefs.R b/R/plotModelCoefs.R index c1e0d60..6ff107a 100644 --- a/R/plotModelCoefs.R +++ b/R/plotModelCoefs.R @@ -24,7 +24,7 @@ #' cell_offset <- createCellOffset(sim_counts) #' scLANE_de_res <- getResultsDE(scLANE_models) #' plotModelCoefs(scLANE_models, -#' gene = scLANE_de_res$Gene[1], +#' gene = "ACLY", #' pt = sim_pseudotime, #' expr.mat = sim_counts, #' size.factor.offset = cell_offset) diff --git a/R/pullMARGESummary.R b/R/pullMARGESummary.R index 2e76b99..166158a 100644 --- a/R/pullMARGESummary.R +++ b/R/pullMARGESummary.R @@ -20,7 +20,7 @@ pullMARGESummary <- function(marge.model = NULL, sandwich.var = FALSE, is.glmm = FALSE) { # check inputs - if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary") } + if (is.null(marge.model)) { stop("A null model must be provided to pullMARGESummary().") } # handle the degenerate case if (inherits(marge.model, "try-error")) { res <- list(marge_pred_df = NULL, diff --git a/R/pullNullSummary.R b/R/pullNullSummary.R index 8e02e6b..1b0de12 100644 --- a/R/pullNullSummary.R +++ b/R/pullNullSummary.R @@ -20,7 +20,7 @@ pullNullSummary <- function(null.model = NULL, sandwich.var = FALSE, is.glmm = FALSE) { # check inputs - if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary") } + if (is.null(null.model)) { stop("A null model must be provided to pullNullSummary().") } # handle the degenerate case if (inherits(null.model, "try-error")) { res <- list(null_sumy_df = NULL, @@ -73,7 +73,8 @@ pullNullSummary <- function(null.model = NULL, res <- list(null_sumy_df = null_sumy_df, null_pred_df = null_pred_df, null_ll = ifelse(is.gee, NA_real_, as.numeric(stats::logLik(null.model))), - null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model))) + null_dev = ifelse(is.gee, NA_real_, stats::deviance(null.model)), + null_fit_notes = NA_character_) } return(res) } diff --git a/R/scoreTestGEE.R b/R/scoreTestGEE.R new file mode 100644 index 0000000..801aa70 --- /dev/null +++ b/R/scoreTestGEE.R @@ -0,0 +1,70 @@ +#' Use a Lagrange multiplier (score) test to compare nested GEE models. +#' +#' @name scoreTestGEE +#' @author Jack R. Leary +#' @description Performs a basic Lagrange multiplier test to determine whether an alternate model is significantly better than a nested null model. This is the GEE equivalent (kind of) of \code{\link{modelLRT}}. Be careful with small sample sizes. +#' @importFrom stats model.matrix predict pchisq +#' @param mod.1 The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL. +#' @param mod.0 The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL. +#' @param alt.df The dataframe used to fit the alternative model. Defaults to NULL. +#' @param null.df The dataframe used to fit the null model. Defaults to NULL. +#' @param sandwich.var Boolean specifying whether the sandwich variance-covariance matrix should be used. Defaults to FALSE. +#' @return A list containing the Score test statistic, a \emph{p}-value, and the degrees of freedom used in the test. +#' @details +#' \itemize{ +#' \item Calculating the test statistic involves taking the inverse of the variance-covariance matrix of the coefficients. Ideally this would be done using the "true" inverse with something like \code{\link{solve}}, \code{\link{qr.solve}}, or \code{\link{chol2inv}}, but in practice this can cause issues when the variance-covariance matrix is near-singular. With this in mind, we use the Moore-Penrose pseudoinverse as implemented in \code{\link[MASS]{ginv}} instead, which leads to more stable results. +#' \item The \emph{p}-value is calculated using an asymptotic \eqn{\Chi^2} distribution, with the degrees of freedom equal to the number of non-intercept coefficients in the alternative model. +#' } +#' @seealso \code{\link[geeM]{geem}} +#' @seealso \code{\link{waldTestGEE}} +#' @seealso \code{\link{modelLRT}} + +scoreTestGEE <- function(mod.1 = NULL, + mod.0 = NULL, + alt.df = NULL, + null.df = NULL, + sandwich.var = FALSE) { + # check inputs + if (is.null(mod.1) || is.null(mod.0) || is.null(alt.df) || is.null(null.df)) { stop("Please provide all inputs to scoreTestGEE().") } + if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) { + res <- list(Score_Stat = NA_real_, + DF = NA_real_, + P_Val = NA_real_, + Notes = "No test performed due to model failure.") + return(res) + } + mod.1 <- mod.1$final_mod + if (!(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to scoreTestGee().") } + if (length(coef(mod.0)) != 1) { stop("Null GEE model must be intercept-only.") } + if (length(coef(mod.1)) <= length(coef(mod.0))) { + # can't calculate Score statistic if both models are intercept-only + res <- list(Score_Stat = 0, + DF = 0, + P_Val = 1, + Notes = NA_character_) + } else { + X_null <- stats::model.matrix(mod.0, data = null.df) + X_alt <- stats::model.matrix(mod.1, data = alt.df) + residuals_null <- null.df$Y - exp(stats::predict(mod.0)) + if (sandwich.var) { + V_null <- as.matrix(mod.0$var) + } else { + V_null <- as.matrix(mod.0$naiv.var) + } + X_alt_t <- t(X_alt) + U <- X_alt_t %*% residuals_null + V_U <- (X_alt_t * as.numeric(V_null)) %*% X_alt + V_U_inv <- try({ eigenMapMatrixInvert(V_U) }, silent = TRUE) + if (inherits(V_U_inv, "try-error")) { + V_U_inv <- eigenMapPseudoInverse(V_U) + } + test_stat <- as.numeric(t(U) %*% V_U_inv %*% U) + df1 <- ncol(X_alt) - ncol(X_null) + p_value <- 1 - stats::pchisq(test_stat, df = df1) + res <- list(Score_Stat = test_stat, + DF = df1, + P_Val = p_value, + Notes = NA_character_) + } + return(res) +} diff --git a/R/stat_out_score_gee_null.R b/R/stat_out_score_gee_null.R index 667bb20..5097907 100644 --- a/R/stat_out_score_gee_null.R +++ b/R/stat_out_score_gee_null.R @@ -35,10 +35,9 @@ stat_out_score_gee_null <- function(Y = NULL, id = id.vec, data = NULL, corstr = cor.structure, - family = MASS::negative.binomial(theta.hat), - scale.fix = TRUE, - sandwich = FALSE, - maxit = 10) + family = MASS::negative.binomial(50, link = "log"), + scale.fix = FALSE, + sandwich = FALSE) alpha_est <- ests$alpha sigma_est <- ests$phi mu_est <- as.matrix(stats::fitted.values(ests)) diff --git a/R/summary.R b/R/summary.R index dd26574..98e4bdd 100644 --- a/R/summary.R +++ b/R/summary.R @@ -31,7 +31,7 @@ summary.scLANE <- function(test.dyn.res = NULL, summary_stats$mean_adj_p_value <- mean(adj_p_values, na.rm = TRUE) summary_stats$test_type <- ifelse(test.dyn.res[[1]][[1]]$Test_Stat_Type == "LRT", "Likelihood Ratio Test", - "Wald Test") + ifelse(test.dyn.res[[1]][[1]]$Test_Stat_Type == "Wald", "Wald Test", "Score Test")) class(summary_stats) <- "summary.scLANE" return(summary_stats) } diff --git a/R/testDynamic.R b/R/testDynamic.R index 9bbf015..2a2d77e 100644 --- a/R/testDynamic.R +++ b/R/testDynamic.R @@ -12,7 +12,7 @@ #' @importFrom doSNOW registerDoSNOW #' @importFrom parallel makeCluster stopCluster clusterEvalQ clusterExport clusterSetRNGStream #' @importFrom withr with_output_sink -#' @importFrom MASS glm.nb negative.binomial theta.mm +#' @importFrom MASS glm.nb negative.binomial #' @importFrom dplyr rename mutate relocate #' @importFrom purrr imap reduce #' @importFrom stats predict logLik deviance offset @@ -25,6 +25,7 @@ #' @param is.gee Should a GEE framework be used instead of the default GLM? Defaults to FALSE. #' @param cor.structure If the GEE framework is used, specifies the desired working correlation structure. Must be one of "ar1", "independence", or "exchangeable". Defaults to "ar1". #' @param gee.bias.correction.method (Optional) Specify which small-sample bias correction to be used on the sandwich variance-covariance matrix prior to test statistic estimation. Options are "kc" and "df". Defaults to NULL, indicating the use of the model-based variance. +#' @param gee.test A string specifying the type of test used to estimate the significance of the full model. Must be one of "wald" or "score". Defaults to "wald". #' @param is.glmm Should a GLMM framework be used instead of the default GLM? Defaults to FALSE. #' @param id.vec If a GEE or GLMM framework is being used, a vector of subject IDs to use as input to \code{\link[geeM]{geem}} or \code{\link[glmmTMB]{glmmTMB}}. Defaults to NULL. #' @param glmm.adaptive (Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to TRUE. @@ -63,6 +64,7 @@ testDynamic <- function(expr.mat = NULL, is.gee = FALSE, cor.structure = "ar1", gee.bias.correction.method = NULL, + gee.test = "wald", is.glmm = FALSE, glmm.adaptive = TRUE, id.vec = NULL, @@ -113,7 +115,9 @@ testDynamic <- function(expr.mat = NULL, if ((is.gee || is.glmm) && is.unsorted(id.vec)) { stop("Your data must be ordered by subject, please do so before running testDynamic() with the GEE / GLMM backends.") } cor.structure <- tolower(cor.structure) if (is.gee && !(cor.structure %in% c("ar1", "independence", "exchangeable"))) { stop("GEE models require a specified correlation structure.") } - + # check GEE testing method + gee.test <- tolower(gee.test) + if (is.gee & !gee.test %in% c("wald", "score")) { stop("GEE testing method must be one of score or wald.") } # set up time tracking start_time <- Sys.time() @@ -143,7 +147,7 @@ testDynamic <- function(expr.mat = NULL, is_read_only = TRUE) # build list of objects to prevent from being sent to parallel workers - necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correction.method", + necessary_vars <- c("expr.mat", "genes", "pt", "n.potential.basis.fns", "approx.knot", "is.glmm", "gee.bias.correction.method", "gee.test", "verbose", "n_lineages", "id.vec", "cor.structure", "is.gee", "gee.scale.fix", "glmm.adaptive", "size.factor.offset") if (any(ls(envir = .GlobalEnv) %in% necessary_vars)) { no_export <- c(ls(envir = .GlobalEnv)[-which(ls(envir = .GlobalEnv) %in% necessary_vars)], @@ -212,11 +216,9 @@ testDynamic <- function(expr.mat = NULL, } # build formula for null model - null_mod_df <- data.frame(Y_null = expr.mat[lineage_cells, i], - Intercept = 1) + null_mod_df <- data.frame(Y_null = expr.mat[lineage_cells, i], Intercept = 1) if (!is.null(id.vec)) { - null_mod_df <- dplyr::mutate(null_mod_df, - subject = id.vec[lineage_cells]) + null_mod_df <- dplyr::mutate(null_mod_df, subject = id.vec[lineage_cells]) } if (is.glmm) { null_mod_formula <- "Y_null ~ (1 | subject)" @@ -224,24 +226,20 @@ testDynamic <- function(expr.mat = NULL, null_mod_formula <- "Y_null ~ -1 + Intercept" } if (!is.null(size.factor.offset)) { - null_mod_df <- dplyr::mutate(null_mod_df, - n_offset = size.factor.offset[lineage_cells]) + null_mod_df <- dplyr::mutate(null_mod_df, n_offset = size.factor.offset[lineage_cells]) null_mod_formula <- paste0(null_mod_formula, " + offset(log(1 / n_offset))") } null_mod_formula <- stats::as.formula(null_mod_formula) - # fit null model for comparison via Wald or LR test + # fit null model for comparison via Wald, Score, or LR test if (is.gee) { - theta_hat <- MASS::theta.mm(y = null_mod_df$Y_null, - mu = mean(null_mod_df$Y_null), - dfr = length(null_mod_df$subject) - 1) null_mod <- try({ geeM::geem(null_mod_formula, id = null_mod_df$subject, data = null_mod_df, - family = MASS::negative.binomial(theta_hat), + family = MASS::negative.binomial(50, link = log), corstr = cor.structure, - scale.fix = TRUE, + scale.fix = FALSE, sandwich = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE)) }, silent = TRUE) } else if (is.glmm) { @@ -252,22 +250,19 @@ testDynamic <- function(expr.mat = NULL, se = TRUE) }, silent = TRUE) } else { - theta_hat <- MASS::theta.mm(y = null_mod_df$Y_null, - mu = mean(null_mod_df$Y_null), - dfr = length(null_mod_df$Y_null) - 1) null_mod <- try({ MASS::glm.nb(null_mod_formula, data = null_mod_df, method = "glm.fit2", y = FALSE, model = FALSE, - init.theta = theta_hat, + init.theta = 1, link = log) }, silent = TRUE) null_mod <- stripGLM(null_mod) } - # record model fit status + # record model fit status for both models if (inherits(marge_mod, "try-error")) { if (inherits(null_mod, "try-error")) { mod_status <- "MARGE model error, null model error" @@ -291,7 +286,7 @@ testDynamic <- function(expr.mat = NULL, sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE), is.glmm = is.glmm) - # perform slope test + # generate data for slope test marge_slope_df <- createSlopeTestData(marge_mod, pt = pt[lineage_cells, j, drop = FALSE], is.gee = is.gee, @@ -325,7 +320,9 @@ testDynamic <- function(expr.mat = NULL, lineage_list[[j]] <- list(Gene = genes[i], Lineage = LETTERS[j], Test_Stat = NA_real_, - Test_Stat_Type = ifelse(is.gee, "Wald", "LRT"), + Test_Stat_Type = ifelse(is.gee, + ifelse(gee.test == "wald", "Wald", "Score"), + "LRT"), Test_Stat_Note = NA_character_, P_Val = NA_real_, LogLik_MARGE = marge_sumy$ll_marge, @@ -334,6 +331,7 @@ testDynamic <- function(expr.mat = NULL, Dev_Null = null_sumy$null_dev, Model_Status = mod_status, MARGE_Fit_Notes = marge_sumy$marge_fit_notes, + Null_Fit_Notes = null_sumy$null_fit_notes, Gene_Time = gene_time_end_numeric, MARGE_Summary = marge_sumy$marge_sumy_df, Null_Summary = null_sumy$null_sumy_df, @@ -344,18 +342,28 @@ testDynamic <- function(expr.mat = NULL, # compute test stat using asymptotic Chi-squared approximation if (is.gee) { - test_res <- waldTestGEE(mod.1 = marge_mod, - mod.0 = null_mod, - correction.method = gee.bias.correction.method, - id.vec = id.vec[lineage_cells], - verbose = verbose) + if (gee.test == "wald") { + test_res <- waldTestGEE(mod.1 = marge_mod, + mod.0 = null_mod, + correction.method = gee.bias.correction.method, + id.vec = id.vec[lineage_cells], + verbose = verbose) + } else if (gee.test == "score") { + test_res <- scoreTestGEE(mod.1 = marge_mod, + mod.0 = null_mod, + alt.df = as.data.frame(marge_mod$basis_mtx), + null.df = null_mod_df, + sandwich.var = ifelse(is.null(gee.bias.correction.method), FALSE, TRUE)) + } } else { test_res <- modelLRT(mod.1 = marge_mod, mod.0 = null_mod, is.glmm = is.glmm) } # add test stats to result list - lineage_list[[j]]$Test_Stat <- ifelse(is.gee, test_res$Wald_Stat, test_res$LRT_Stat) + lineage_list[[j]]$Test_Stat <- ifelse(is.gee, + ifelse(gee.test == "wald", test_res$Wald_Stat, test_res$Score_Stat), + test_res$LRT_Stat) lineage_list[[j]]$Test_Stat_Note <- test_res$Notes lineage_list[[j]]$P_Val <- test_res$P_Val } @@ -380,7 +388,9 @@ testDynamic <- function(expr.mat = NULL, list(Gene = y, Lineage = LETTERS[z], Test_Stat = NA_real_, - Test_Stat_Type = ifelse(is.gee, "Wald", "LRT"), + Test_Stat_Type = ifelse(is.gee, + ifelse(gee.test == "wald", "Wald", "Score"), + "LRT"), Test_Stat_Note = NA_character_, P_Val = NA_real_, LogLik_MARGE = NA_real_, @@ -390,6 +400,7 @@ testDynamic <- function(expr.mat = NULL, Model_Status = x[1], Gene_Time = NA_real_, MARGE_Fit_Notes = NA_character_, + Null_Fit_Notes = NA_character_, MARGE_Summary = NULL, Null_Summary = NULL, MARGE_Preds = NULL, diff --git a/R/waldTestGEE.R b/R/waldTestGEE.R index 6acbc43..b9e5eae 100644 --- a/R/waldTestGEE.R +++ b/R/waldTestGEE.R @@ -27,6 +27,7 @@ waldTestGEE <- function(mod.1 = NULL, id.vec = NULL, verbose = FALSE) { # check inputs + if (is.null(mod.1) || is.null(mod.0)) { stop("Please provide both models to waldTestGEE().") } if (inherits(mod.1, "try-error") || inherits(mod.0, "try-error")) { res <- list(Wald_Stat = NA_real_, DF = NA_real_, @@ -43,7 +44,7 @@ waldTestGEE <- function(mod.1 = NULL, } mod.1 <- mod.1$final_mod - if (is.null(mod.1) || is.null(mod.0) || !(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to waldTestGee().") } + if (!(inherits(mod.1, "geem") && inherits(mod.0, "geem"))) { stop("You must provide two geeM models to waldTestGee().") } if (length(coef(mod.0)) != 1) { stop("Null GEE model must be intercept-only.") } if (length(coef(mod.1)) <= length(coef(mod.0))) { # can't calculate Wald statistic if both models are intercept-only diff --git a/data/scLANE_models.rda b/data/scLANE_models.rda index fbec682..c668286 100644 Binary files a/data/scLANE_models.rda and b/data/scLANE_models.rda differ diff --git a/man/marge2.Rd b/man/marge2.Rd index 847124a..7a8b39c 100644 --- a/man/marge2.Rd +++ b/man/marge2.Rd @@ -15,7 +15,7 @@ marge2( cor.structure = "ar1", sandwich.var = FALSE, approx.knot = TRUE, - n.knot.max = 50, + n.knot.max = 25, glm.backend = "MASS", tols_score = 1e-05, minspan = NULL, @@ -45,7 +45,7 @@ marge2( \item{approx.knot}{(Optional) Should the set of candidate knots be subsampled in order to speed up computation? This has little effect on the final fit, but can improve computation time somewhat. Defaults to TRUE.} -\item{n.knot.max}{(Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 50.} +\item{n.knot.max}{(Optional) The maximum number of candidate knots to consider. Uses uniform sampling to select this number of unique values from the reduced set of all candidate knots. Defaults to 25.} \item{glm.backend}{(Optional) Character specifying which GLM-fitting backend should be used. Must be one of "MASS" or "speedglm". Defaults to "MASS".} diff --git a/man/plotModelCoefs.Rd b/man/plotModelCoefs.Rd index bdff591..f93b7ee 100644 --- a/man/plotModelCoefs.Rd +++ b/man/plotModelCoefs.Rd @@ -42,7 +42,7 @@ data(scLANE_models) cell_offset <- createCellOffset(sim_counts) scLANE_de_res <- getResultsDE(scLANE_models) plotModelCoefs(scLANE_models, - gene = scLANE_de_res$Gene[1], + gene = "ACLY", pt = sim_pseudotime, expr.mat = sim_counts, size.factor.offset = cell_offset) diff --git a/man/scoreTestGEE.Rd b/man/scoreTestGEE.Rd new file mode 100644 index 0000000..5ea410d --- /dev/null +++ b/man/scoreTestGEE.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/scoreTestGEE.R +\name{scoreTestGEE} +\alias{scoreTestGEE} +\title{Use a Lagrange multiplier (score) test to compare nested GEE models.} +\usage{ +scoreTestGEE( + mod.1 = NULL, + mod.0 = NULL, + alt.df = NULL, + null.df = NULL, + sandwich.var = FALSE +) +} +\arguments{ +\item{mod.1}{The model under the alternative hypothesis. Must be of class \code{geem}. Defaults to NULL.} + +\item{mod.0}{The model under the null hypothesis. Must be of class \code{geem}. Defaults to NULL.} + +\item{alt.df}{The dataframe used to fit the alternative model. Defaults to NULL.} + +\item{null.df}{The dataframe used to fit the null model. Defaults to NULL.} + +\item{sandwich.var}{Boolean specifying whether the sandwich variance-covariance matrix should be used. Defaults to FALSE.} +} +\value{ +A list containing the Score test statistic, a \emph{p}-value, and the degrees of freedom used in the test. +} +\description{ +Performs a basic Lagrange multiplier test to determine whether an alternate model is significantly better than a nested null model. This is the GEE equivalent (kind of) of \code{\link{modelLRT}}. Be careful with small sample sizes. +} +\details{ +\itemize{ +\item Calculating the test statistic involves taking the inverse of the variance-covariance matrix of the coefficients. Ideally this would be done using the "true" inverse with something like \code{\link{solve}}, \code{\link{qr.solve}}, or \code{\link{chol2inv}}, but in practice this can cause issues when the variance-covariance matrix is near-singular. With this in mind, we use the Moore-Penrose pseudoinverse as implemented in \code{\link[MASS]{ginv}} instead, which leads to more stable results. +\item The \emph{p}-value is calculated using an asymptotic \eqn{\Chi^2} distribution, with the degrees of freedom equal to the number of non-intercept coefficients in the alternative model. +} +} +\seealso{ +\code{\link[geeM]{geem}} + +\code{\link{waldTestGEE}} + +\code{\link{modelLRT}} +} +\author{ +Jack R. Leary +} diff --git a/man/testDynamic.Rd b/man/testDynamic.Rd index 34b34b8..3319a7f 100644 --- a/man/testDynamic.Rd +++ b/man/testDynamic.Rd @@ -12,6 +12,7 @@ testDynamic( is.gee = FALSE, cor.structure = "ar1", gee.bias.correction.method = NULL, + gee.test = "wald", is.glmm = FALSE, glmm.adaptive = TRUE, id.vec = NULL, @@ -37,6 +38,8 @@ testDynamic( \item{gee.bias.correction.method}{(Optional) Specify which small-sample bias correction to be used on the sandwich variance-covariance matrix prior to test statistic estimation. Options are "kc" and "df". Defaults to NULL, indicating the use of the model-based variance.} +\item{gee.test}{A string specifying the type of test used to estimate the significance of the full model. Must be one of "wald" or "score". Defaults to "wald".} + \item{is.glmm}{Should a GLMM framework be used instead of the default GLM? Defaults to FALSE.} \item{glmm.adaptive}{(Optional) Should the basis functions for the GLMM be chosen adaptively? If not, uses 4 evenly spaced knots. Defaults to TRUE.} diff --git a/tests/testthat/test_scLANE.R b/tests/testthat/test_scLANE.R index 7bfee2e..5fc60cd 100644 --- a/tests/testthat/test_scLANE.R +++ b/tests/testthat/test_scLANE.R @@ -127,14 +127,19 @@ withr::with_output_sink(tempfile(), { return.GCV = TRUE, return.WIC = TRUE) # fit null GEE model - null_mod_GEE <- geeM::geem(Y ~ 1 + offset(log(cell_offset)), - family = MASS::negative.binomial(1), + null_mod_GEE <- geeM::geem(Y ~ 1 + offset(log(1 / cell_offset)), + family = MASS::negative.binomial(50, link = log), data = data.frame(Y = counts_test[, 3]), corstr = "ar1", scale.fix = FALSE, sandwich = FALSE) # run GEE Wald test wald_test <- waldTestGEE(marge_mod_GEE_offset, mod.0 = null_mod_GEE) + # run GEE Score test + score_test <- scoreTestGEE(marge_mod_GEE_offset, + mod.0 = null_mod_GEE, + alt.df = as.data.frame(marge_mod_GEE_offset$basis_mtx), + null.df = data.frame(Y = counts_test[, 3])) # run GLMM model -- no offset glmm_mod <- fitGLMM(pt_test, Y = counts_test[, 4], @@ -386,6 +391,10 @@ test_that("Statistical testing output", { expect_length(wald_test, 4) expect_type(wald_test$P_Val, "double") expect_true(is.na(wald_test$Notes)) + expect_type(score_test, "list") + expect_length(score_test, 4) + expect_type(score_test$P_Val, "double") + expect_true(is.na(score_test$Notes)) expect_type(glm_lrt, "list") expect_length(glm_lrt, 7) expect_type(glm_lrt$P_Val, "double")