diff --git a/NEWS.md b/NEWS.md index 04a89c2..657045f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,8 +1,13 @@ # missRanger 2.6.0 -## Minor changes +## Minor changes in output object + +- Add original data as `data_raw`. +- Renamed `visit_seq` to `to_impute`. + +## Other changes -- Add original data to output. +- Improvement in the vignette on censored data. # missRanger 2.5.0 diff --git a/R/missRanger.R b/R/missRanger.R index 62ea58d..ed55906 100644 --- a/R/missRanger.R +++ b/R/missRanger.R @@ -62,13 +62,13 @@ #' `write.forest`, `probability`, `split.select.weights`, #' `dependent.variable.name`, and `classification`. #' @returns -#' If `data_only` an imputed `data.frame`. Otherwise, a "missRanger" object with -#' the following elements that can be extracted via `$`: +#' If `data_only = TRUE` an imputed `data.frame`. Otherwise, a "missRanger" object +#' with the following elements: #' - `data`: The imputed data. #' - `data_raw`: The original data provided. #' - `forests`: When `keep_forests = TRUE`, a list of "ranger" models used to #' generate the imputed data. `NULL` otherwise. -#' - `visit_seq`: Variables to be imputed (in this order). +#' - `to_impute`: Variables to be imputed (in this order). #' - `impute_by`: Variables used for imputation. #' - `best_iter`: Best iteration. #' - `pred_errors`: Per-iteration OOB prediction errors (1 - R^2 for regression, @@ -92,19 +92,17 @@ #' head(irisImputed) #' head(irisWithNA) #' -#' \dontrun{ #' # Extended output -#' imp <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, data_only = FALSE) +#' imp <- missRanger(irisWithNA, pmm.k = 3, num.trees = 50, data_only = FALSE) #' head(imp$data) #' imp$pred_errors #' #' # If you even want to keep the random forests of the best iteration #' imp <- missRanger( -#' irisWithNA, pmm.k = 3, num.trees = 100, data_only = FALSE, keep_forests = TRUE +#' irisWithNA, pmm.k = 3, num.trees = 50, data_only = FALSE, keep_forests = TRUE #' ) #' imp$forests$Sepal.Width #' imp$pred_errors[imp$best_iter, "Sepal.Width"] # 1 - R-squared -#' } missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, seed = NULL, verbose = 1, returnOOB = FALSE, case.weights = NULL, data_only = TRUE, keep_forests = FALSE, ...) { @@ -153,7 +151,7 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, stats::terms.formula(stats::reformulate(z), data = data[1L, ]), "term.labels" ) - trimws(out, whitespace = "`") # Remove enclosing backticks + return(trimws(out, whitespace = "`")) # Remove enclosing backticks } relevant_vars <- lapply(formula[2:3], parsef) @@ -171,14 +169,14 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, data[, to_impute] <- converted$X # Remove variables that cannot be safely converted - visit_seq <- setdiff(to_impute, converted$bad) + to_impute <- setdiff(to_impute, converted$bad) if (verbose) { cat("\n Variables to impute:\t\t") - cat(visit_seq, sep = ", ") + cat(to_impute, sep = ", ") } - if (!length(visit_seq)) { + if (!length(to_impute)) { if (verbose) { cat("\n") } @@ -188,8 +186,9 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, out <- structure( list( data = data, + data_raw = data_raw, forests = NULL, - visit_seq = c(), + to_impute = c(), impute_by = c(), best_iter = 0L, pred_errors = NULL, @@ -202,16 +201,16 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, } # Get missing indicators and order variables by number of missings - data_NA <- is.na(data[, visit_seq, drop = FALSE]) - visit_seq <- names(sort(colSums(data_NA))) + data_NA <- is.na(data[, to_impute, drop = FALSE]) + to_impute <- names(sort(colSums(data_NA))) # 3) SELECT VARIABLES USED TO IMPUTE - # Variables on the rhs should either appear in "visit_seq" + # Variables on the rhs should either appear in "to_impute" # or do not contain any missings - impute_by <- relevant_vars[[2L]][relevant_vars[[2L]] %in% visit_seq | + impute_by <- relevant_vars[[2L]][relevant_vars[[2L]] %in% to_impute | !vapply(data[, relevant_vars[[2L]], drop = FALSE], anyNA, TRUE)] - completed <- setdiff(impute_by, visit_seq) + completed <- setdiff(impute_by, to_impute) if (verbose) { cat("\n Variables used to impute:\t") @@ -225,14 +224,14 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, j <- 1L crit <- TRUE dig <- 4L - pred_error <- stats::setNames(rep(1, length(visit_seq)), visit_seq) + pred_error <- stats::setNames(rep(1, length(to_impute)), to_impute) pred_errors <- list() if (keep_forests) { forests <- list() } if (verbose >= 2) { - cat("\n", abbreviate(visit_seq, minlength = dig + 2L), sep = "\t") + cat("\n", abbreviate(to_impute, minlength = dig + 2L), sep = "\t") } # Looping over iterations and variables to impute @@ -240,10 +239,8 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, if (verbose) { if (verbose == 1) { i <- 1L - cat("\n") - cat(paste("iter", j)) - cat("\n") - pb <- utils::txtProgressBar(0, length(visit_seq), style = 3) + cat("\niter", j, "\n") + pb <- utils::txtProgressBar(0, length(to_impute), style = 3) } else if (verbose >= 2) { cat("\niter ", j, ":\t", sep = "") } @@ -255,7 +252,7 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, forests_last <- forests } - for (v in visit_seq) { + for (v in to_impute) { v.na <- data_NA[, v] if (length(completed) == 0L) { @@ -338,7 +335,7 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, data = data_last, data_raw = data_raw, forests = if (keep_forests) forests_last, - visit_seq = visit_seq, + to_impute = to_impute, impute_by = impute_by, best_iter = best_iter, pred_errors = do.call(rbind, pred_errors), diff --git a/R/pmm.R b/R/pmm.R index 70ec638..5e6754f 100644 --- a/R/pmm.R +++ b/R/pmm.R @@ -16,11 +16,11 @@ #' @returns Vector of the same length as `xtest` with values from `xtrain`. #' @export #' @examples -#' pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1)) # 0 -#' pmm(xtrain = c(TRUE, FALSE, TRUE), xtest = FALSE, ytrain = c(2, 0, 1)) # 0 -#' pmm(xtrain = c(0.2, 0.8), xtest = 0.3, ytrain = c("A", "B"), k = 2) # "A" or "B" -#' pmm(xtrain = c("A", "A", "B"), xtest = "A", ytrain = c(2, 2, 4), k = 2) # 2 -#' pmm(xtrain = factor(c("A", "B")), xtest = factor("C"), ytrain = 1:2) # 2 +#' pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1)) +#' pmm(xtrain = c(TRUE, FALSE, TRUE), xtest = FALSE, ytrain = c(2, 0, 1)) +#' pmm(xtrain = c(0.2, 0.8), xtest = 0.3, ytrain = c("A", "B"), k = 2) +#' pmm(xtrain = c("A", "A", "B"), xtest = "A", ytrain = c(2, 2, 4), k = 2) +#' pmm(xtrain = factor(c("A", "B")), xtest = factor("C"), ytrain = 1:2) pmm <- function(xtrain, xtest, ytrain, k = 1L, seed = NULL) { stopifnot( length(xtrain) == length(ytrain), diff --git a/man/missRanger.Rd b/man/missRanger.Rd index 57e2f9b..edef677 100644 --- a/man/missRanger.Rd +++ b/man/missRanger.Rd @@ -64,14 +64,14 @@ better use less trees (e.g. \code{num.trees = 20}) and/or a low value of \code{dependent.variable.name}, and \code{classification}.} } \value{ -If \code{data_only} an imputed \code{data.frame}. Otherwise, a "missRanger" object with -the following elements that can be extracted via \code{$}: +If \code{data_only = TRUE} an imputed \code{data.frame}. Otherwise, a "missRanger" object +with the following elements: \itemize{ \item \code{data}: The imputed data. \item \code{data_raw}: The original data provided. \item \code{forests}: When \code{keep_forests = TRUE}, a list of "ranger" models used to generate the imputed data. \code{NULL} otherwise. -\item \code{visit_seq}: Variables to be imputed (in this order). +\item \code{to_impute}: Variables to be imputed (in this order). \item \code{impute_by}: Variables used for imputation. \item \code{best_iter}: Best iteration. \item \code{pred_errors}: Per-iteration OOB prediction errors (1 - R^2 for regression, @@ -113,20 +113,18 @@ irisImputed <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100) head(irisImputed) head(irisWithNA) -\dontrun{ # Extended output -imp <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, data_only = FALSE) +imp <- missRanger(irisWithNA, pmm.k = 3, num.trees = 50, data_only = FALSE) head(imp$data) imp$pred_errors # If you even want to keep the random forests of the best iteration imp <- missRanger( - irisWithNA, pmm.k = 3, num.trees = 100, data_only = FALSE, keep_forests = TRUE + irisWithNA, pmm.k = 3, num.trees = 50, data_only = FALSE, keep_forests = TRUE ) imp$forests$Sepal.Width imp$pred_errors[imp$best_iter, "Sepal.Width"] # 1 - R-squared } -} \references{ \enumerate{ \item Wright, M. N. & Ziegler, A. (2016). ranger: A Fast Implementation of diff --git a/man/pmm.Rd b/man/pmm.Rd index 048b04b..d8d6773 100644 --- a/man/pmm.Rd +++ b/man/pmm.Rd @@ -30,9 +30,9 @@ values in the prediction vector \code{xtrain} is randomly chosen and its observe value in \code{ytrain} is returned. } \examples{ -pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1)) # 0 -pmm(xtrain = c(TRUE, FALSE, TRUE), xtest = FALSE, ytrain = c(2, 0, 1)) # 0 -pmm(xtrain = c(0.2, 0.8), xtest = 0.3, ytrain = c("A", "B"), k = 2) # "A" or "B" -pmm(xtrain = c("A", "A", "B"), xtest = "A", ytrain = c(2, 2, 4), k = 2) # 2 -pmm(xtrain = factor(c("A", "B")), xtest = factor("C"), ytrain = 1:2) # 2 +pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1)) +pmm(xtrain = c(TRUE, FALSE, TRUE), xtest = FALSE, ytrain = c(2, 0, 1)) +pmm(xtrain = c(0.2, 0.8), xtest = 0.3, ytrain = c("A", "B"), k = 2) +pmm(xtrain = c("A", "A", "B"), xtest = "A", ytrain = c(2, 2, 4), k = 2) +pmm(xtrain = factor(c("A", "B")), xtest = factor("C"), ytrain = 1:2) } diff --git a/tests/testthat/test-ranger.R b/tests/testthat/test-ranger.R new file mode 100644 index 0000000..4a03637 --- /dev/null +++ b/tests/testthat/test-ranger.R @@ -0,0 +1,27 @@ +test_that("x/y API of ranger does not care about feature order in x at prediction", { + fit <- ranger::ranger(y = iris[[1L]], x = iris[, 2:5], num.trees = 10L) + pred1 <- predict(fit, iris[1:5, 2:5]) + pred2 <- predict(fit, iris[1:5, 5:2]) + expect_equal(pred1, pred2) +}) + +test_that("x/y API of ranger does not care about extra columns during prediction", { + fit <- ranger::ranger(y = iris[[1L]], x = iris[, 2:5], num.trees = 10L) + pred1 <- predict(fit, iris[1:5, 2:5]) + pred2 <- predict(fit, iris[1:5, ]) + expect_equal(pred1, pred2) +}) + + +test_that("ranger can safely predict on character feature", { + ir <- transform(iris, Species = as.character(Species)) + fit <- ranger::ranger(Sepal.Length ~ ., data = ir) + + pred1 <- predict(fit, ir[c(1, 51, 101), ])$predictions + pred2 <- c( + predict(fit, ir[c(1), ])$predictions, + predict(fit, ir[c(51), ])$predictions, + predict(fit, ir[c(101), ])$predictions + ) + expect_equal(pred1, pred2) +}) diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd index a9aeb7d..fc1783e 100644 --- a/vignettes/working_with_censoring.Rmd +++ b/vignettes/working_with_censoring.Rmd @@ -21,7 +21,7 @@ knitr::opts_chunk$set( ## How to deal with censored variables? -There is no obvious way of how to deal with survival variables as covariables in imputation models. +There is no obvious way of how to deal with survival variables as covariates in imputation models. Options discussed in [@white] include: @@ -31,16 +31,16 @@ Options discussed in [@white] include: - $\text{surv}(t)$, and, optionally $s$ -By $\text{surv}(t)$, we denote the Nelson-Aalen survival estimate at each value of $t$. The third option is the most elegant one as it explicitly deals with censoring information. We provide some additional details on it in the example. +By $\text{surv}(t)$, we denote the Nelson-Aalen survival estimate at each value of $t$. The third option seems attractive as it explicitly deals with censoring information. We provide some additional details on it in the example. ### Example -For illustration, we use data from a randomized two-arm trial about lung cancer. The aim is to estimate the treatment effect of "trt" with reliable inference using Cox regression. Unfortunately, we have added missing values in the covariables "age", "karno", and "diagtime". +For illustration, we use data from a randomized two-arm trial about lung cancer. The aim is to estimate the treatment effect of "trt" with reliable inference using Cox regression. We add missing values in the covariates "age", "karno", and "diagtime". -A reasonable way to estimate the covariable adjusted treatment effect is the following: +Let's estimate the covariate adjusted treatment effect using the following steps: 1. Add Nelson-Aalen survival estimates "surv" to the dataset. -2. Use "surv" as well as the covariables to impute missing values in the covariables multiple times. +2. Use "surv" as well as the covariates to impute missing values in the covariates multiple times. 3. Perform the intended Cox regression for each of the imputed data sets. 4. Pool their results by Rubin's rule [@rubin], using package {mice} [@buuren]. @@ -62,14 +62,12 @@ head(veteran) # 6 1 squamous 10 1 20 5 49 0 # 1. Calculate Nelson-Aalen survival probabilities for each time point -nelson_aalen <- summary( - survfit(Surv(time, status) ~ 1, data = veteran), - times = unique(veteran$time) -)[c("time", "surv")] -nelson_aalen <- data.frame(nelson_aalen) +nelson_aalen <- survfit(Surv(time, status) ~ 1, data = veteran) |> + summary(times = unique(veteran$time)) +nelson_aalen <- nelson_aalen[c("time", "surv")] # Add it to the original data set -veteran2 <- merge(veteran, nelson_aalen, all.x = TRUE) +veteran2 <- merge(veteran, nelson_aalen, all.x = TRUE, by = "time") # Add missing values to make things tricky veteran2 <- generateNA(veteran2, p = c(age = 0.1, karno = 0.1, diagtime = 0.1)) @@ -80,7 +78,7 @@ filled <- replicate( missRanger( veteran2, . ~ . - time - status, verbose = 0, - pmm.k = 3, + pmm.k = 10, num.trees = 25 ), simplify = FALSE @@ -92,28 +90,28 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x)) # 4. Pool the results by mice summary(pooled_fit <- pool(models)) -# term estimate std.error statistic df p.value -# 1 trt 0.245855250 0.212810467 1.1552780 108.72929 2.505091e-01 -# 2 celltypesmallcell 0.805233656 0.284424937 2.8310937 114.17088 5.483657e-03 -# 3 celltypeadeno 1.110172771 0.307570269 3.6094931 111.91588 4.603422e-04 -# 4 celltypelarge 0.328227283 0.291163500 1.1272954 109.30510 2.620862e-01 -# 5 karno -0.031838682 0.005663349 -5.6218824 112.60325 1.390333e-07 -# 6 diagtime 0.002775351 0.009382270 0.2958081 86.61582 7.680847e-01 -# 7 age -0.007843577 0.009293988 -0.8439410 107.86917 4.005701e-01 -# 8 prior 0.003165245 0.023501821 0.1346809 111.84783 8.931063e-01 + term estimate std.error statistic df p.value +1 trt 0.253538955 0.212912636 1.1908122 108.60558 2.363235e-01 +2 celltypesmallcell 0.815455716 0.285715626 2.8540816 113.48112 5.132019e-03 +3 celltypeadeno 1.110530627 0.311566058 3.5643505 108.55506 5.436043e-04 +4 celltypelarge 0.346762041 0.288497586 1.2019582 113.27005 2.318866e-01 +5 karno -0.030754195 0.005758037 -5.3410901 99.68585 5.837902e-07 +6 diagtime 0.001835002 0.009233708 0.1987286 107.07024 8.428519e-01 +7 age -0.006032505 0.009447786 -0.6385099 101.88961 5.245745e-01 +8 prior 0.002983767 0.023275501 0.1281935 115.27919 8.982192e-01 # Compare with the results on the original data summary(coxph(Surv(time, status) ~ ., veteran))$coefficients -# coef exp(coef) se(coef) z Pr(>|z|) -# trt 2.946028e-01 1.3425930 0.207549604 1.419433313 1.557727e-01 -# celltypesmallcell 8.615605e-01 2.3668512 0.275284474 3.129709606 1.749792e-03 -# celltypeadeno 1.196066e+00 3.3070825 0.300916994 3.974738536 7.045662e-05 -# celltypelarge 4.012917e-01 1.4937529 0.282688638 1.419553530 1.557377e-01 -# karno -3.281533e-02 0.9677173 0.005507757 -5.958020093 2.553121e-09 -# diagtime 8.132051e-05 1.0000813 0.009136062 0.008901046 9.928981e-01 -# age -8.706475e-03 0.9913313 0.009300299 -0.936149992 3.491960e-01 -# prior 7.159360e-03 1.0071850 0.023230538 0.308187441 7.579397e-01 + coef exp(coef) se(coef) z Pr(>|z|) +trt 2.946028e-01 1.3425930 0.207549604 1.419433313 1.557727e-01 +celltypesmallcell 8.615605e-01 2.3668512 0.275284474 3.129709606 1.749792e-03 +celltypeadeno 1.196066e+00 3.3070825 0.300916994 3.974738536 7.045662e-05 +celltypelarge 4.012917e-01 1.4937529 0.282688638 1.419553530 1.557377e-01 +karno -3.281533e-02 0.9677173 0.005507757 -5.958020093 2.553121e-09 +diagtime 8.132051e-05 1.0000813 0.009136062 0.008901046 9.928981e-01 +age -8.706475e-03 0.9913313 0.009300299 -0.936149992 3.491960e-01 +prior 7.159360e-03 1.0071850 0.023230538 0.308187441 7.579397e-01 ``` ## References